From 32bd671d6cbeda60dc73be77fa2b9037d9a9bfa0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Feb 2009 12:24:15 +0100 Subject: signal: re-add dead task accumulation stats. We're going to split the process wide cpu accounting into two parts: - clocks; which can take all the time they want since they run from user context. - timers; which need constant time tracing but can affort the overhead because they're default off -- and rare. The clock readout will go back to a full sum of the thread group, for this we need to re-add the exit stats that were removed in the initial itimer rework (f06febc9: timers: fix itimer/many thread hang). Furthermore, since that full sum can be rather slow for large thread groups and we have the complete dead task stats, revert the do_notify_parent time computation. Signed-off-by: Peter Zijlstra Reviewed-by: Ingo Molnar Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2127e959e0f4..2e0646a30314 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -559,7 +559,7 @@ struct signal_struct { * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ - cputime_t cutime, cstime; + cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; @@ -567,6 +567,14 @@ struct signal_struct { unsigned long inblock, oublock, cinblock, coublock; struct task_io_accounting ioac; + /* + * Cumulative ns of schedule CPU time fo dead threads in the + * group, not including a zombie group leader, (This only differs + * from jiffies_to_ns(utime + stime) if sched_clock uses something + * other than jiffies.) + */ + unsigned long long sum_sched_runtime; + /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs -- cgit v1.2.3 From 4cd4c1b40d40447fb5e7ba80746c6d7ba91d7a53 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Feb 2009 12:24:16 +0100 Subject: timers: split process wide cpu clocks/timers Change the process wide cpu timers/clocks so that we: 1) don't mess up the kernel with too many threads, 2) don't have a per-cpu allocation for each process, 3) have no impact when not used. In order to accomplish this we're going to split it into two parts: - clocks; which can take all the time they want since they run from user context -- ie. sys_clock_gettime(CLOCK_PROCESS_CPUTIME_ID) - timers; which need constant time sampling but since they're explicity used, the user can pay the overhead. The clock readout will go back to a full sum of the thread group, while the timers will run of a global 'clock' that only runs when needed, so only programs that make use of the facility pay the price. Signed-off-by: Peter Zijlstra Reviewed-by: Ingo Molnar Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 11 +++--- include/linux/sched.h | 54 +++++++++++++++------------ kernel/itimer.c | 4 +- kernel/posix-cpu-timers.c | 95 +++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched_stats.h | 45 ++++++++++++---------- 5 files changed, 155 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index ea0ea1a4c36f..e752d973fa21 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -48,12 +48,11 @@ extern struct fs_struct init_fs; .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ - .cputime = { .totals = { \ - .utime = cputime_zero, \ - .stime = cputime_zero, \ - .sum_exec_runtime = 0, \ - .lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock), \ - }, }, \ + .cputimer = { \ + .cputime = INIT_CPUTIME, \ + .running = 0, \ + .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ + }, \ } extern struct nsproxy init_nsproxy; diff --git a/include/linux/sched.h b/include/linux/sched.h index 2e0646a30314..082d7619b3a1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -443,7 +443,6 @@ struct pacct_struct { * @utime: time spent in user mode, in &cputime_t units * @stime: time spent in kernel mode, in &cputime_t units * @sum_exec_runtime: total time spent on the CPU, in nanoseconds - * @lock: lock for fields in this struct * * This structure groups together three kinds of CPU time that are * tracked for threads and thread groups. Most things considering @@ -454,23 +453,33 @@ struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; - spinlock_t lock; }; /* Alternate field names when used to cache expirations. */ #define prof_exp stime #define virt_exp utime #define sched_exp sum_exec_runtime +#define INIT_CPUTIME \ + (struct task_cputime) { \ + .utime = cputime_zero, \ + .stime = cputime_zero, \ + .sum_exec_runtime = 0, \ + } + /** - * struct thread_group_cputime - thread group interval timer counts - * @totals: thread group interval timers; substructure for - * uniprocessor kernel, per-cpu for SMP kernel. + * struct thread_group_cputimer - thread group interval timer counts + * @cputime: thread group interval timers. + * @running: non-zero when there are timers running and + * @cputime receives updates. + * @lock: lock for fields in this struct. * * This structure contains the version of task_cputime, above, that is - * used for thread group CPU clock calculations. + * used for thread group CPU timer calculations. */ -struct thread_group_cputime { - struct task_cputime totals; +struct thread_group_cputimer { + struct task_cputime cputime; + int running; + spinlock_t lock; }; /* @@ -519,10 +528,10 @@ struct signal_struct { cputime_t it_prof_incr, it_virt_incr; /* - * Thread group totals for process CPU clocks. - * See thread_group_cputime(), et al, for details. + * Thread group totals for process CPU timers. + * See thread_group_cputimer(), et al, for details. */ - struct thread_group_cputime cputime; + struct thread_group_cputimer cputimer; /* Earliest-expiration cache. */ struct task_cputime cputime_expires; @@ -2191,27 +2200,26 @@ static inline int spin_needbreak(spinlock_t *lock) /* * Thread group CPU time accounting. */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); static inline -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) { - struct task_cputime *totals = &tsk->signal->cputime.totals; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; unsigned long flags; - spin_lock_irqsave(&totals->lock, flags); - *times = *totals; - spin_unlock_irqrestore(&totals->lock, flags); + WARN_ON(!cputimer->running); + + spin_lock_irqsave(&cputimer->lock, flags); + *times = cputimer->cputime; + spin_unlock_irqrestore(&cputimer->lock, flags); } static inline void thread_group_cputime_init(struct signal_struct *sig) { - sig->cputime.totals = (struct task_cputime){ - .utime = cputime_zero, - .stime = cputime_zero, - .sum_exec_runtime = 0, - }; - - spin_lock_init(&sig->cputime.totals.lock); + sig->cputimer.cputime = INIT_CPUTIME; + spin_lock_init(&sig->cputimer.lock); + sig->cputimer.running = 0; } static inline void thread_group_cputime_free(struct signal_struct *sig) diff --git a/kernel/itimer.c b/kernel/itimer.c index 6a5fe93dd8bd..58762f7077ec 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -62,7 +62,7 @@ int do_getitimer(int which, struct itimerval *value) struct task_cputime cputime; cputime_t utime; - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); utime = cputime.utime; if (cputime_le(cval, utime)) { /* about to fire */ cval = jiffies_to_cputime(1); @@ -82,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value) struct task_cputime times; cputime_t ptime; - thread_group_cputime(tsk, ×); + thread_group_cputimer(tsk, ×); ptime = cputime_add(times.utime, times.stime); if (cputime_le(cval, ptime)) { /* about to fire */ cval = jiffies_to_cputime(1); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index fa07da94d7be..db107c9bbc05 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -230,6 +230,37 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, return 0; } +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct sighand_struct *sighand; + struct signal_struct *sig; + struct task_struct *t; + + *times = INIT_CPUTIME; + + rcu_read_lock(); + sighand = rcu_dereference(tsk->sighand); + if (!sighand) + goto out; + + sig = tsk->signal; + + t = tsk; + do { + times->utime = cputime_add(times->utime, t->utime); + times->stime = cputime_add(times->stime, t->stime); + times->sum_exec_runtime += t->se.sum_exec_runtime; + + t = next_thread(t); + } while (t != tsk); + + times->utime = cputime_add(times->utime, sig->utime); + times->stime = cputime_add(times->stime, sig->stime); + times->sum_exec_runtime += sig->sum_sched_runtime; +out: + rcu_read_unlock(); +} + /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. @@ -475,6 +506,29 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) now); } +/* + * Enable the process wide cpu timer accounting. + * + * serialized using ->sighand->siglock + */ +static void start_process_timers(struct task_struct *tsk) +{ + tsk->signal->cputimer.running = 1; + barrier(); +} + +/* + * Release the process wide timer accounting -- timer stops ticking when + * nobody cares about it. + * + * serialized using ->sighand->siglock + */ +static void stop_process_timers(struct task_struct *tsk) +{ + tsk->signal->cputimer.running = 0; + barrier(); +} + /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held @@ -495,6 +549,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) BUG_ON(!irqs_disabled()); spin_lock(&p->sighand->siglock); + if (!CPUCLOCK_PERTHREAD(timer->it_clock)) + start_process_timers(p); + listpos = head; if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { list_for_each_entry(next, head, entry) { @@ -987,13 +1044,15 @@ static void check_process_timers(struct task_struct *tsk, sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && list_empty(&timers[CPUCLOCK_VIRT]) && cputime_eq(sig->it_virt_expires, cputime_zero) && - list_empty(&timers[CPUCLOCK_SCHED])) + list_empty(&timers[CPUCLOCK_SCHED])) { + stop_process_timers(tsk); return; + } /* * Collect the current process totals. */ - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); utime = cputime.utime; ptime = cputime_add(utime, cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; @@ -1259,7 +1318,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (!task_cputime_zero(&sig->cputime_expires)) { struct task_cputime group_sample; - thread_group_cputime(tsk, &group_sample); + thread_group_cputimer(tsk, &group_sample); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; } @@ -1328,6 +1387,33 @@ void run_posix_cpu_timers(struct task_struct *tsk) } } +/* + * Sample a process (thread group) timer for the given group_leader task. + * Must be called with tasklist_lock held for reading. + */ +static int cpu_timer_sample_group(const clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) +{ + struct task_cputime cputime; + + thread_group_cputimer(p, &cputime); + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + cpu->cpu = cputime_add(cputime.utime, cputime.stime); + break; + case CPUCLOCK_VIRT: + cpu->cpu = cputime.utime; + break; + case CPUCLOCK_SCHED: + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + break; + } + return 0; +} + /* * Set one of the process-wide special case CPU timers. * The tsk->sighand->siglock must be held by the caller. @@ -1341,7 +1427,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - cpu_clock_sample_group(clock_idx, tsk, &now); + start_process_timers(tsk); + cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { if (!cputime_eq(*oldval, cputime_zero)) { diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 8ab0cef8ecab..a8f93dd374e1 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -296,19 +296,21 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { - struct task_cputime *times; - struct signal_struct *sig; + struct thread_group_cputimer *cputimer; /* tsk == current, ensure it is safe to use ->signal */ if (unlikely(tsk->exit_state)) return; - sig = tsk->signal; - times = &sig->cputime.totals; + cputimer = &tsk->signal->cputimer; - spin_lock(×->lock); - times->utime = cputime_add(times->utime, cputime); - spin_unlock(×->lock); + if (!cputimer->running) + return; + + spin_lock(&cputimer->lock); + cputimer->cputime.utime = + cputime_add(cputimer->cputime.utime, cputime); + spin_unlock(&cputimer->lock); } /** @@ -324,19 +326,21 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { - struct task_cputime *times; - struct signal_struct *sig; + struct thread_group_cputimer *cputimer; /* tsk == current, ensure it is safe to use ->signal */ if (unlikely(tsk->exit_state)) return; - sig = tsk->signal; - times = &sig->cputime.totals; + cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return; - spin_lock(×->lock); - times->stime = cputime_add(times->stime, cputime); - spin_unlock(×->lock); + spin_lock(&cputimer->lock); + cputimer->cputime.stime = + cputime_add(cputimer->cputime.stime, cputime); + spin_unlock(&cputimer->lock); } /** @@ -352,7 +356,7 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { - struct task_cputime *times; + struct thread_group_cputimer *cputimer; struct signal_struct *sig; sig = tsk->signal; @@ -361,9 +365,12 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (unlikely(!sig)) return; - times = &sig->cputime.totals; + cputimer = &sig->cputimer; + + if (!cputimer->running) + return; - spin_lock(×->lock); - times->sum_exec_runtime += ns; - spin_unlock(×->lock); + spin_lock(&cputimer->lock); + cputimer->cputime.sum_exec_runtime += ns; + spin_unlock(&cputimer->lock); } -- cgit v1.2.3 From 7d8e23df69820e6be42bcc41d441f4860e8c76f7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 6 Feb 2009 14:57:51 +0100 Subject: timers: split process wide cpu clocks/timers, remove spurious warning Mike Galbraith reported that the new warning in thread_group_cputimer() triggers en masse with Amarok running. Oleg Nesterov observed: Can't fastpath_timer_check()->thread_group_cputimer() have the false warning too? Suppose we had the timer, then posix_cpu_timer_del() removes this timer, but task_cputime_zero(&sig->cputime_expires) still not true. Remove the spurious debug warning. Reported-by: Mike Galbraith Explained-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 082d7619b3a1..79392916d6c9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2208,8 +2208,6 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; unsigned long flags; - WARN_ON(!cputimer->running); - spin_lock_irqsave(&cputimer->lock, flags); *times = cputimer->cputime; spin_unlock_irqrestore(&cputimer->lock, flags); -- cgit v1.2.3 From 43a990765a9e874350bae1009366d00809dbc9d8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Feb 2009 00:00:22 +0100 Subject: sound: Remove OSSlib stuff from linux/soundcard.h Removed OSSlib stuff from linux/soundcard.h to fix the warnings for 'make headers_check'. This patch breaks building against OSSlib with the kernel headers instead of its own headers. It should still work with any version of the library from the 2003 onwards which provide their own headers for the latest interface. Signed-off-by: Arnd Bergmann Cc: Jaswinder Singh Rajput Signed-off-by: Takashi Iwai --- include/linux/soundcard.h | 74 +++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/soundcard.h b/include/linux/soundcard.h index 523d069c862c..1904afedb82f 100644 --- a/include/linux/soundcard.h +++ b/include/linux/soundcard.h @@ -1045,50 +1045,36 @@ typedef struct mixer_vol_table { */ #define LOCL_STARTAUDIO 1 -#if (!defined(__KERNEL__) && !defined(KERNEL) && !defined(INKERNEL) && !defined(_KERNEL)) || defined(USE_SEQ_MACROS) +#if !defined(__KERNEL__) || defined(USE_SEQ_MACROS) /* * Some convenience macros to simplify programming of the * /dev/sequencer interface * - * These macros define the API which should be used when possible. + * This is a legacy interface for applications written against + * the OSSlib-3.8 style interface. It is no longer possible + * to actually link against OSSlib with this header, but we + * still provide these macros for programs using them. + * + * If you want to use OSSlib, it is recommended that you get + * the GPL version of OSS-4.x and build against that version + * of the header. + * + * We redefine the extern keyword so that make headers_check + * does not complain about SEQ_USE_EXTBUF. */ #define SEQ_DECLAREBUF() SEQ_USE_EXTBUF() void seqbuf_dump(void); /* This function must be provided by programs */ -extern int OSS_init(int seqfd, int buflen); -extern void OSS_seqbuf_dump(int fd, unsigned char *buf, int buflen); -extern void OSS_seq_advbuf(int len, int fd, unsigned char *buf, int buflen); -extern void OSS_seq_needbuf(int len, int fd, unsigned char *buf, int buflen); -extern void OSS_patch_caching(int dev, int chn, int patch, - int fd, unsigned char *buf, int buflen); -extern void OSS_drum_caching(int dev, int chn, int patch, - int fd, unsigned char *buf, int buflen); -extern void OSS_write_patch(int fd, unsigned char *buf, int len); -extern int OSS_write_patch2(int fd, unsigned char *buf, int len); - #define SEQ_PM_DEFINES int __foo_bar___ -#ifdef OSSLIB -# define SEQ_USE_EXTBUF() \ - extern unsigned char *_seqbuf; \ - extern int _seqbuflen;extern int _seqbufptr -# define SEQ_DEFINEBUF(len) SEQ_USE_EXTBUF();static int _requested_seqbuflen=len -# define _SEQ_ADVBUF(len) OSS_seq_advbuf(len, seqfd, _seqbuf, _seqbuflen) -# define _SEQ_NEEDBUF(len) OSS_seq_needbuf(len, seqfd, _seqbuf, _seqbuflen) -# define SEQ_DUMPBUF() OSS_seqbuf_dump(seqfd, _seqbuf, _seqbuflen) - -# define SEQ_LOAD_GMINSTR(dev, instr) \ - OSS_patch_caching(dev, -1, instr, seqfd, _seqbuf, _seqbuflen) -# define SEQ_LOAD_GMDRUM(dev, drum) \ - OSS_drum_caching(dev, -1, drum, seqfd, _seqbuf, _seqbuflen) -#else /* !OSSLIB */ - -# define SEQ_LOAD_GMINSTR(dev, instr) -# define SEQ_LOAD_GMDRUM(dev, drum) - -# define SEQ_USE_EXTBUF() \ - extern unsigned char _seqbuf[]; \ - extern int _seqbuflen;extern int _seqbufptr + +#define SEQ_LOAD_GMINSTR(dev, instr) +#define SEQ_LOAD_GMDRUM(dev, drum) + +#define _SEQ_EXTERN extern +#define SEQ_USE_EXTBUF() \ + _SEQ_EXTERN unsigned char _seqbuf[]; \ + _SEQ_EXTERN int _seqbuflen; _SEQ_EXTERN int _seqbufptr #ifndef USE_SIMPLE_MACROS /* Sample seqbuf_dump() implementation: @@ -1131,7 +1117,6 @@ extern int OSS_write_patch2(int fd, unsigned char *buf, int len); */ #define _SEQ_NEEDBUF(len) /* empty */ #endif -#endif /* !OSSLIB */ #define SEQ_VOLUME_MODE(dev, mode) {_SEQ_NEEDBUF(8);\ _seqbuf[_seqbufptr] = SEQ_EXTENDED;\ @@ -1215,14 +1200,8 @@ extern int OSS_write_patch2(int fd, unsigned char *buf, int len); _CHN_COMMON(dev, MIDI_CHN_PRESSURE, chn, pressure, 0, 0) #define SEQ_SET_PATCH SEQ_PGM_CHANGE -#ifdef OSSLIB -# define SEQ_PGM_CHANGE(dev, chn, patch) \ - {OSS_patch_caching(dev, chn, patch, seqfd, _seqbuf, _seqbuflen); \ - _CHN_COMMON(dev, MIDI_PGM_CHANGE, chn, patch, 0, 0);} -#else -# define SEQ_PGM_CHANGE(dev, chn, patch) \ +#define SEQ_PGM_CHANGE(dev, chn, patch) \ _CHN_COMMON(dev, MIDI_PGM_CHANGE, chn, patch, 0, 0) -#endif #define SEQ_CONTROL(dev, chn, controller, value) \ _CHN_COMMON(dev, MIDI_CTL_CHANGE, chn, controller, 0, value) @@ -1300,19 +1279,12 @@ extern int OSS_write_patch2(int fd, unsigned char *buf, int len); /* * Patch loading. */ -#ifdef OSSLIB -# define SEQ_WRPATCH(patchx, len) \ - OSS_write_patch(seqfd, (char*)(patchx), len) -# define SEQ_WRPATCH2(patchx, len) \ - OSS_write_patch2(seqfd, (char*)(patchx), len) -#else -# define SEQ_WRPATCH(patchx, len) \ +#define SEQ_WRPATCH(patchx, len) \ {if (_seqbufptr) SEQ_DUMPBUF();\ if (write(seqfd, (char*)(patchx), len)==-1) \ perror("Write patch: /dev/sequencer");} -# define SEQ_WRPATCH2(patchx, len) \ +#define SEQ_WRPATCH2(patchx, len) \ (SEQ_DUMPBUF(), write(seqfd, (char*)(patchx), len)) -#endif #endif #endif -- cgit v1.2.3 From 7f5aa215088b817add9c71914b83650bdd49f8a9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 Feb 2009 11:15:34 -0500 Subject: jbd2: Avoid possible NULL dereference in jbd2_journal_begin_ordered_truncate() If we race with commit code setting i_transaction to NULL, we could possibly dereference it. Proper locking requires the journal pointer (to access journal->j_list_lock), which we don't have. So we have to change the prototype of the function so that filesystem passes us the journal pointer. Also add a more detailed comment about why the function jbd2_journal_begin_ordered_truncate() does what it does and how it should be used. Thanks to Dan Carpenter for pointing to the suspitious code. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Acked-by: Joel Becker CC: linux-ext4@vger.kernel.org CC: ocfs2-devel@oss.oracle.com CC: mfasheh@suse.de CC: Dan Carpenter --- fs/ext4/inode.c | 6 ++++-- fs/jbd2/transaction.c | 42 +++++++++++++++++++++++++++++++----------- fs/ocfs2/journal.h | 6 ++++-- include/linux/jbd2.h | 3 ++- 4 files changed, 41 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 03ba20be1329..658c4a7f2578 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,8 +47,10 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, - new_size); + return jbd2_journal_begin_ordered_truncate( + EXT4_SB(inode->i_sb)->s_journal, + &EXT4_I(inode)->jinode, + new_size); } static void ext4_invalidatepage(struct page *page, unsigned long offset); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 46b4e347ed7d..28ce21d8598e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2129,26 +2129,46 @@ done: } /* - * This function must be called when inode is journaled in ordered mode - * before truncation happens. It starts writeout of truncated part in - * case it is in the committing transaction so that we stand to ordered - * mode consistency guarantees. + * File truncate and transaction commit interact with each other in a + * non-trivial way. If a transaction writing data block A is + * committing, we cannot discard the data by truncate until we have + * written them. Otherwise if we crashed after the transaction with + * write has committed but before the transaction with truncate has + * committed, we could see stale data in block A. This function is a + * helper to solve this problem. It starts writeout of the truncated + * part in case it is in the committing transaction. + * + * Filesystem code must call this function when inode is journaled in + * ordered mode before truncation happens and after the inode has been + * placed on orphan list with the new inode size. The second condition + * avoids the race that someone writes new data and we start + * committing the transaction after this function has been called but + * before a transaction for truncate is started (and furthermore it + * allows us to optimize the case where the addition to orphan list + * happens in the same transaction as write --- we don't have to write + * any data in such case). */ -int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, +int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *jinode, loff_t new_size) { - journal_t *journal; - transaction_t *commit_trans; + transaction_t *inode_trans, *commit_trans; int ret = 0; - if (!inode->i_transaction && !inode->i_next_transaction) + /* This is a quick check to avoid locking if not necessary */ + if (!jinode->i_transaction) goto out; - journal = inode->i_transaction->t_journal; + /* Locks are here just to force reading of recent values, it is + * enough that the transaction was not committing before we started + * a transaction adding the inode to orphan list */ spin_lock(&journal->j_state_lock); commit_trans = journal->j_committing_transaction; spin_unlock(&journal->j_state_lock); - if (inode->i_transaction == commit_trans) { - ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, + spin_lock(&journal->j_list_lock); + inode_trans = jinode->i_transaction; + spin_unlock(&journal->j_list_lock); + if (inode_trans == commit_trans) { + ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, new_size, LLONG_MAX); if (ret) jbd2_journal_abort(journal, ret); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3c3532e1307c..172850a9a12a 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) static inline int ocfs2_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode, - new_size); + return jbd2_journal_begin_ordered_truncate( + OCFS2_SB(inode->i_sb)->journal->j_journal, + &OCFS2_I(inode)->ip_jinode, + new_size); } #endif /* OCFS2_JOURNAL_H */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b28b37eb11c6..4d248b3f1323 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1150,7 +1150,8 @@ extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); -extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size); +extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); -- cgit v1.2.3 From e672f7db767156bf71adf9c592cfe81b339523d6 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Tue, 10 Feb 2009 17:18:17 -0800 Subject: pkt_sched: type should be __u32 in header Using u32 in this header breaks the build of iptables. Signed-off-by: Chuck Ebbert Signed-off-by: David S. Miller --- include/linux/pkt_sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index b2648e8e4987..d51a2b3e221e 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -515,7 +515,7 @@ enum struct tc_drr_stats { - u32 deficit; + __u32 deficit; }; #endif -- cgit v1.2.3 From 3fccfd67df79c6351a156eb25a7a514e5f39c4d9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Feb 2009 16:37:31 +0100 Subject: timers: split process wide cpu clocks/timers, fix To decrease the chance of a missed enable, always enable the timer when we sample it, we'll always disable it when we find that there are no active timers in the jiffy tick. This fixes a flood of warnings reported by Mike Galbraith. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/posix-cpu-timers.c | 42 ++++++++++++++---------------------------- 2 files changed, 15 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 79392916d6c9..5d10fa0b6002 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2209,6 +2209,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) unsigned long flags; spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 1; *times = cputimer->cputime; spin_unlock_irqrestore(&cputimer->lock, flags); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index db107c9bbc05..e5d7bfdfa7d4 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -488,7 +488,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) { struct task_cputime cputime; - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); cleanup_timers(tsk->signal->cpu_timers, cputime.utime, cputime.stime, cputime.sum_exec_runtime); } @@ -506,29 +506,6 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) now); } -/* - * Enable the process wide cpu timer accounting. - * - * serialized using ->sighand->siglock - */ -static void start_process_timers(struct task_struct *tsk) -{ - tsk->signal->cputimer.running = 1; - barrier(); -} - -/* - * Release the process wide timer accounting -- timer stops ticking when - * nobody cares about it. - * - * serialized using ->sighand->siglock - */ -static void stop_process_timers(struct task_struct *tsk) -{ - tsk->signal->cputimer.running = 0; - barrier(); -} - /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held @@ -549,9 +526,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) BUG_ON(!irqs_disabled()); spin_lock(&p->sighand->siglock); - if (!CPUCLOCK_PERTHREAD(timer->it_clock)) - start_process_timers(p); - listpos = head; if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { list_for_each_entry(next, head, entry) { @@ -1021,6 +995,19 @@ static void check_thread_timers(struct task_struct *tsk, } } +static void stop_process_timers(struct task_struct *tsk) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + unsigned long flags; + + if (!cputimer->running) + return; + + spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 0; + spin_unlock_irqrestore(&cputimer->lock, flags); +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1427,7 +1414,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - start_process_timers(tsk); cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { -- cgit v1.2.3 From 4da94d49b2ecb0a26e716a8811c3ecc542c2a65d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Feb 2009 11:30:27 +0100 Subject: timers: fix TIMER_ABSTIME for process wide cpu timers The POSIX timer interface allows for absolute time expiry values through the TIMER_ABSTIME flag, therefore we have to synchronize the timer to the clock every time we start it. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 13 +------------ kernel/posix-cpu-timers.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5d10fa0b6002..8981e52c714f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2201,18 +2201,7 @@ static inline int spin_needbreak(spinlock_t *lock) * Thread group CPU time accounting. */ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); - -static inline -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) -{ - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - unsigned long flags; - - spin_lock_irqsave(&cputimer->lock, flags); - cputimer->running = 1; - *times = cputimer->cputime; - spin_unlock_irqrestore(&cputimer->lock, flags); -} +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); static inline void thread_group_cputime_init(struct signal_struct *sig) { diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e5d7bfdfa7d4..2313a4cc14ea 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -261,6 +261,40 @@ out: rcu_read_unlock(); } +static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) +{ + if (cputime_gt(b->utime, a->utime)) + a->utime = b->utime; + + if (cputime_gt(b->stime, a->stime)) + a->stime = b->stime; + + if (b->sum_exec_runtime > a->sum_exec_runtime) + a->sum_exec_runtime = b->sum_exec_runtime; +} + +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct task_cputime sum; + unsigned long flags; + + spin_lock_irqsave(&cputimer->lock, flags); + if (!cputimer->running) { + cputimer->running = 1; + /* + * The POSIX timer interface allows for absolute time expiry + * values through the TIMER_ABSTIME flag, therefore we have + * to synchronize the timer to the clock every time we start + * it. + */ + thread_group_cputime(tsk, &sum); + update_gt_cputime(&cputimer->cputime, &sum); + } + *times = cputimer->cputime; + spin_unlock_irqrestore(&cputimer->lock, flags); +} + /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. -- cgit v1.2.3 From 9f339e7028e2855717af3193c938f9960ad13b38 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Wed, 11 Feb 2009 15:10:27 +0100 Subject: x86, ptrace, mm: fix double-free on race Ptrace_detach() races with __ptrace_unlink() if the traced task is reaped while detaching. This might cause a double-free of the BTS buffer. Change the ptrace_detach() path to only do the memory accounting in ptrace_bts_detach() and leave the buffer free to ptrace_bts_untrace() which will be called from __ptrace_unlink(). The fix follows a proposal from Oleg Nesterov. Reported-by: Oleg Nesterov Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 16 ++++++++++------ include/linux/mm.h | 1 + mm/mlock.c | 7 ++++++- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0a5df5f82fb9..5a4c23d89892 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -810,12 +810,16 @@ static void ptrace_bts_untrace(struct task_struct *child) static void ptrace_bts_detach(struct task_struct *child) { - if (unlikely(child->bts)) { - ds_release_bts(child->bts); - child->bts = NULL; - - ptrace_bts_free_buffer(child); - } + /* + * Ptrace_detach() races with ptrace_untrace() in case + * the child dies and is reaped by another thread. + * + * We only do the memory accounting at this point and + * leave the buffer deallocation and the bts tracer + * release to ptrace_bts_untrace() which will be called + * later on with tasklist_lock held. + */ + release_locked_buffer(child->bts_buffer, child->bts_size); } #else static inline void ptrace_bts_fork(struct task_struct *tsk) {} diff --git a/include/linux/mm.h b/include/linux/mm.h index e8ddc98b8405..3d7fb44d7d7e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1305,5 +1305,6 @@ void vmemmap_populate_print_last(void); extern void *alloc_locked_buffer(size_t size); extern void free_locked_buffer(void *buffer, size_t size); +extern void release_locked_buffer(void *buffer, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 028ec482fdd4..2b57f7e60390 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -657,7 +657,7 @@ void *alloc_locked_buffer(size_t size) return buffer; } -void free_locked_buffer(void *buffer, size_t size) +void release_locked_buffer(void *buffer, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; @@ -667,6 +667,11 @@ void free_locked_buffer(void *buffer, size_t size) current->mm->locked_vm -= pgsz; up_write(¤t->mm->mmap_sem); +} + +void free_locked_buffer(void *buffer, size_t size) +{ + release_locked_buffer(buffer, size); kfree(buffer); } -- cgit v1.2.3 From cfebe563bd0a3ff97e1bc167123120d59c7a84db Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 11 Feb 2009 13:04:36 -0800 Subject: cgroups: fix lockdep subclasses overflow I enabled all cgroup subsystems when compiling kernel, and then: # mount -t cgroup -o net_cls xxx /mnt # mkdir /mnt/0 This showed up immediately: BUG: MAX_LOCKDEP_SUBCLASSES too low! turning off the locking correctness validator. It's caused by the cgroup hierarchy lock: for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->root == root) mutex_lock_nested(&ss->hierarchy_mutex, i); } Now we have 9 cgroup subsystems, and the above 'i' for net_cls is 8, but MAX_LOCKDEP_SUBCLASSES is 8. This patch uses different lockdep keys for different subsystems. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e4e8e117d27d..499900d0cee7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -378,6 +378,7 @@ struct cgroup_subsys { * - initiating hotplug events */ struct mutex hierarchy_mutex; + struct lock_class_key subsys_key; /* * Link to parent, and list entry in parent's children. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5a54ff42874e..e14db9c089b9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2351,7 +2351,7 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root) for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->root == root) - mutex_lock_nested(&ss->hierarchy_mutex, i); + mutex_lock(&ss->hierarchy_mutex); } } @@ -2637,6 +2637,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(!list_empty(&init_task.tasks)); mutex_init(&ss->hierarchy_mutex); + lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; } -- cgit v1.2.3 From 6c5979631b4b03c9288776562c18036765e398c1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Feb 2009 13:04:38 -0800 Subject: syscall define: fix uml compile bug With the new system call defines we get this on uml: arch/um/sys-i386/built-in.o: In function `sys_call_table': (.rodata+0x308): undefined reference to `sys_sigprocmask' Reason for this is that uml passes the preprocessor option -Dsigprocmask=kernel_sigprocmask to gcc when compiling the kernel. This causes SYSCALL_DEFINE3(sigprocmask, ...) to be expanded to SYSCALL_DEFINEx(3, kernel_sigprocmask, ...) and finally to a system call named sys_kernel_sigprocmask. However sys_sigprocmask is missing because of this. To avoid macro expansion for the system call name just concatenate the name at first define instead of carrying it through severel levels. This was pointed out by Al Viro. Signed-off-by: Heiko Carstens Cc: Geert Uytterhoeven Cc: Al Viro Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0eda02ff2414..f9f900cfd066 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -95,13 +95,13 @@ struct old_linux_dirent; #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) #define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) -#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) -#define SYSCALL_DEFINE1(...) SYSCALL_DEFINEx(1, __VA_ARGS__) -#define SYSCALL_DEFINE2(...) SYSCALL_DEFINEx(2, __VA_ARGS__) -#define SYSCALL_DEFINE3(...) SYSCALL_DEFINEx(3, __VA_ARGS__) -#define SYSCALL_DEFINE4(...) SYSCALL_DEFINEx(4, __VA_ARGS__) -#define SYSCALL_DEFINE5(...) SYSCALL_DEFINEx(5, __VA_ARGS__) -#define SYSCALL_DEFINE6(...) SYSCALL_DEFINEx(6, __VA_ARGS__) +#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) +#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) #ifdef CONFIG_PPC64 #define SYSCALL_ALIAS(alias, name) \ @@ -121,21 +121,21 @@ struct old_linux_dirent; #define SYSCALL_DEFINE(name) static inline long SYSC_##name #define SYSCALL_DEFINEx(x, name, ...) \ - asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__)); \ - static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__)); \ - asmlinkage long SyS_##name(__SC_LONG##x(__VA_ARGS__)) \ + asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \ + static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \ + asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \ { \ __SC_TEST##x(__VA_ARGS__); \ - return (long) SYSC_##name(__SC_CAST##x(__VA_ARGS__)); \ + return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__)); \ } \ - SYSCALL_ALIAS(sys_##name, SyS_##name); \ - static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__)) + SYSCALL_ALIAS(sys##name, SyS##name); \ + static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)) #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ #define SYSCALL_DEFINE(name) asmlinkage long sys_##name #define SYSCALL_DEFINEx(x, name, ...) \ - asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__)) + asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)) #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ -- cgit v1.2.3 From 99709372736a216f99eb32b76fba835a2bfc93a8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 12 Feb 2009 16:43:17 -0800 Subject: net: don't use in_atomic() in gfp_any() The problem is that in_atomic() will return false inside spinlocks if CONFIG_PREEMPT=n. This will lead to deadlockable GFP_KERNEL allocations from spinlocked regions. Secondly, if CONFIG_PREEMPT=y, this bug solves itself because networking will instead use GFP_ATOMIC from this callsite. Hence we won't get the might_sleep() debugging warnings which would have informed us of the buggy callsites. Solve both these problems by switching to in_interrupt(). Now, if someone runs a gfp_any() allocation from inside spinlock we will get the warning if CONFIG_PREEMPT=y. I reviewed all callsites and most of them were too complex for my little brain and none of them documented their interface requirements. I have no idea what this patch will do. Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 5a3a151bd730..ce3b5b622683 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1308,7 +1308,7 @@ static inline int sock_writeable(const struct sock *sk) static inline gfp_t gfp_any(void) { - return in_atomic() ? GFP_ATOMIC : GFP_KERNEL; + return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline long sock_rcvtimeo(const struct sock *sk, int noblock) -- cgit v1.2.3 From 7a0eb1960e8ddcb68ea631caf16815485af0e228 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 19 Jan 2009 14:57:52 +0200 Subject: KVM: Avoid using CONFIG_ in userspace visible headers Kconfig symbols are not available in userspace, and are not stripped by headers-install. Avoid their use by adding #defines in to suit each architecture. Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm.h | 4 ++++ arch/x86/include/asm/kvm.h | 7 +++++++ include/linux/kvm.h | 10 +++++----- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index 68aa6da807c1..bfa86b6af7cd 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -25,6 +25,10 @@ #include +/* Select x86 specific features in */ +#define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_DEVICE_ASSIGNMENT + /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index d2e3bf3608af..886c9402ec45 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -9,6 +9,13 @@ #include #include +/* Select x86 specific features in */ +#define __KVM_HAVE_PIT +#define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_DEVICE_ASSIGNMENT +#define __KVM_HAVE_MSI +#define __KVM_HAVE_USER_NMI + /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 5715f1907601..0424326f1679 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -58,10 +58,10 @@ struct kvm_irqchip { __u32 pad; union { char dummy[512]; /* reserving space */ -#ifdef CONFIG_X86 +#ifdef __KVM_HAVE_PIT struct kvm_pic_state pic; #endif -#if defined(CONFIG_X86) || defined(CONFIG_IA64) +#ifdef __KVM_HAVE_IOAPIC struct kvm_ioapic_state ioapic; #endif } chip; @@ -384,16 +384,16 @@ struct kvm_trace_rec { #define KVM_CAP_MP_STATE 14 #define KVM_CAP_COALESCED_MMIO 15 #define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ -#if defined(CONFIG_X86)||defined(CONFIG_IA64) +#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT #define KVM_CAP_DEVICE_ASSIGNMENT 17 #endif #define KVM_CAP_IOMMU 18 -#if defined(CONFIG_X86) +#ifdef __KVM_HAVE_MSI #define KVM_CAP_DEVICE_MSI 20 #endif /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21 -#if defined(CONFIG_X86) +#ifdef __KVM_HAVE_USER_NMI #define KVM_CAP_USER_NMI 22 #endif -- cgit v1.2.3 From ad8ba2cd44d4d39fb3fe55d5dcc565b19fc3a7fb Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 6 Jan 2009 10:03:02 +0800 Subject: KVM: Add kvm_arch_sync_events to sync with asynchronize events kvm_arch_sync_events is introduced to quiet down all other events may happen contemporary with VM destroy process, like IRQ handler and work struct for assigned device. For kvm_arch_sync_events is called at the very beginning of kvm_destroy_vm(), so the state of KVM here is legal and can provide a environment to quiet down other events. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 4 ++++ arch/powerpc/kvm/powerpc.c | 4 ++++ arch/s390/kvm/kvm-s390.c | 4 ++++ arch/x86/kvm/x86.c | 4 ++++ include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 1 + 6 files changed, 18 insertions(+) (limited to 'include') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 4e586f6110aa..28f982045f29 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1337,6 +1337,10 @@ static void kvm_release_vm_pages(struct kvm *kvm) } } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_iommu_unmap_guest(kvm); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2822c8ccfaaf..5f81256287f5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -125,6 +125,10 @@ static void kvmppc_free_vcpus(struct kvm *kvm) } } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvmppc_free_vcpus(kvm); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index be8497186b96..0d33893e1e89 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -212,6 +212,10 @@ static void kvm_free_vcpus(struct kvm *kvm) } } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_free_vcpus(kvm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cc17546a2406..b0fc079f1bee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4127,6 +4127,10 @@ static void kvm_free_vcpus(struct kvm *kvm) } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_free_all_assigned_devices(kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ec49d0be7f52..bf6f703642fc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -285,6 +285,7 @@ void kvm_free_physmem(struct kvm *kvm); struct kvm *kvm_arch_create_vm(void); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); +void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *v); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0b6f2f71271f..68e3f1ec1674 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -891,6 +891,7 @@ static void kvm_destroy_vm(struct kvm *kvm) { struct mm_struct *mm = kvm->mm; + kvm_arch_sync_events(kvm); spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); -- cgit v1.2.3 From 2f6097129af5625db2fb5e601f89e5bf55cd1dcd Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Mon, 16 Feb 2009 12:49:16 +0000 Subject: FRV: __pte_to_swp_entry doesn't expand correctly The macro doesn't expand correctly when its parameter isn't 'pte'. Signed-off-by: Roel Kluin Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- include/asm-frv/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h index 83c51aba534b..e16fdb1f4f4f 100644 --- a/include/asm-frv/pgtable.h +++ b/include/asm-frv/pgtable.h @@ -478,7 +478,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __swp_type(x) (((x).val >> 2) & 0x1f) #define __swp_offset(x) ((x).val >> 8) #define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 2) | ((offset) << 8) }) -#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte }) +#define __pte_to_swp_entry(_pte) ((swp_entry_t) { (_pte).pte }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) static inline int pte_file(pte_t pte) -- cgit v1.2.3 From 5955c7a2cfb6a35429adea5dc480002b15ca8cfc Mon Sep 17 00:00:00 2001 From: Zlatko Calusic Date: Wed, 18 Feb 2009 01:33:34 +0100 Subject: Add support for VT6415 PCIE PATA IDE Host Controller Signed-off-by: Zlatko Calusic Signed-off-by: Linus Torvalds --- drivers/ata/pata_via.c | 4 +++- include/linux/pci_ids.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c index 79a6c9a0b721..ba556d3e6963 100644 --- a/drivers/ata/pata_via.c +++ b/drivers/ata/pata_via.c @@ -110,7 +110,8 @@ static const struct via_isa_bridge { { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_SATA_PATA }, - { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES}, + { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES }, + { "vt6415", PCI_DEVICE_ID_VIA_6415, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES }, { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, @@ -593,6 +594,7 @@ static int via_reinit_one(struct pci_dev *pdev) #endif static const struct pci_device_id via[] = { + { PCI_VDEVICE(VIA, 0x0415), }, { PCI_VDEVICE(VIA, 0x0571), }, { PCI_VDEVICE(VIA, 0x0581), }, { PCI_VDEVICE(VIA, 0x1571), }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 52a9fe08451c..918391b4b109 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1312,6 +1312,7 @@ #define PCI_DEVICE_ID_VIA_VT3351 0x0351 #define PCI_DEVICE_ID_VIA_VT3364 0x0364 #define PCI_DEVICE_ID_VIA_8371_0 0x0391 +#define PCI_DEVICE_ID_VIA_6415 0x0415 #define PCI_DEVICE_ID_VIA_8501_0 0x0501 #define PCI_DEVICE_ID_VIA_82C561 0x0561 #define PCI_DEVICE_ID_VIA_82C586_1 0x0571 -- cgit v1.2.3