From cff2b760096d1e6feaa31948e7af4abbefe47822 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 11 Feb 2006 17:55:47 -0800 Subject: [PATCH] fstatat64 support The *at patches introduced fstatat and, due to inusfficient research, I used the newfstat functions generally as the guideline. The result is that on 32-bit platforms we don't have all the information needed to implement fstatat64. This patch modifies the code to pass up 64-bit information if __ARCH_WANT_STAT64 is defined. I renamed the syscall entry point to make this clear. Other archs will continue to use the existing code. On x86-64 the compat code is implemented using a new sys32_ function. this is what is done for the other stat syscalls as well. This patch might break some other archs (those which define __ARCH_WANT_STAT64 and which already wired up the syscall). Yet others might need changes to accomodate the compatibility mode. I really don't want to do that work because all this stat handling is a mess (more so in glibc, but the kernel is also affected). It should be done by the arch maintainers. I'll provide some stand-alone test shortly. Those who are eager could compile glibc and run 'make check' (no installation needed). The patch below has been tested on x86 and x86-64. Signed-off-by: Ulrich Drepper Cc: Christoph Hellwig Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3877209d23c3..d73501ba7e44 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -557,6 +557,8 @@ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, int mode); asmlinkage long sys_newfstatat(int dfd, char __user *filename, struct stat __user *statbuf, int flag); +asmlinkage long sys_fstatat64(int dfd, char __user *filename, + struct stat64 __user *statbuf, int flag); asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf, int bufsiz); asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, -- cgit v1.2.3 From 643a654540579b0dcc7a206a4a7475276a41aff0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 11 Feb 2006 17:55:52 -0800 Subject: [PATCH] select: fix returned timeval With David Woodhouse select() presently has a habit of increasing the value of the user's `timeout' argument on return. We were writing back a timeout larger than the original. We _deliberately_ round up, since we know we must wait at _least_ as long as the caller asks us to. The patch adds a couple of helper functions for magnitude comparison of timespecs and of timevals, and uses them to prevent the various poll and select functions from returning a timeout which is larger than the one which was passed in. The patch also fixes a bug in compat_sys_pselect7(): it was adding the new timeout value to the old one and was returning that. It should just return the new timeout value. (We have various handy timespec/timeval-to-from-nsec conversion functions in time.h. But this code open-codes it all). Cc: "David S. Miller" Cc: Andi Kleen Cc: Ulrich Drepper Cc: Thomas Gleixner Cc: george anzinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 37 +++++++++++++++++++++++++------------ fs/select.c | 32 +++++++++++++++++++++++--------- include/linux/compat.h | 20 ++++++++++++++++++++ include/linux/time.h | 25 ++++++++++++++++++++++++- 4 files changed, 92 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/compat.c b/fs/compat.c index 70c5af4cc270..a2ba78bdf7f7 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1751,11 +1751,15 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, ret = compat_core_sys_select(n, inp, outp, exp, &timeout); if (tvp) { + struct compat_timeval rtv; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); - tv.tv_sec = timeout; - if (copy_to_user(tvp, &tv, sizeof(tv))) { + rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); + rtv.tv_sec = timeout; + if (compat_timeval_compare(&rtv, &tv) < 0) + rtv = tv; + if (copy_to_user(tvp, &rtv, sizeof(rtv))) { sticky: /* * If an application puts its timeval in read-only @@ -1822,13 +1826,17 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec)); if (tsp && !(current->personality & STICKY_TIMEOUTS)) { - ts.tv_sec += timeout / HZ; - ts.tv_nsec += (timeout % HZ) * (1000000000/HZ); - if (ts.tv_nsec >= 1000000000) { - ts.tv_sec++; - ts.tv_nsec -= 1000000000; + struct compat_timespec rts; + + rts.tv_sec = timeout / HZ; + rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ); + if (rts.tv_nsec >= NSEC_PER_SEC) { + rts.tv_sec++; + rts.tv_nsec -= NSEC_PER_SEC; } - (void)copy_to_user(tsp, &ts, sizeof(ts)); + if (compat_timespec_compare(&rts, &ts) < 0) + rts = ts; + copy_to_user(tsp, &rts, sizeof(rts)); } if (ret == -ERESTARTNOHAND) { @@ -1918,12 +1926,17 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, sigprocmask(SIG_SETMASK, &sigsaved, NULL); if (tsp && timeout >= 0) { + struct compat_timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; /* Yes, we know it's actually an s64, but it's also positive. */ - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (compat_timespec_compare(&rts, &ts) < 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only diff --git a/fs/select.c b/fs/select.c index bc60a3e14ef3..6ce68a9c8976 100644 --- a/fs/select.c +++ b/fs/select.c @@ -398,11 +398,15 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, ret = core_sys_select(n, inp, outp, exp, &timeout); if (tvp) { + struct timeval rtv; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); - tv.tv_sec = timeout; - if (copy_to_user(tvp, &tv, sizeof(tv))) { + rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); + rtv.tv_sec = timeout; + if (timeval_compare(&rtv, &tv) < 0) + rtv = tv; + if (copy_to_user(tvp, &rtv, sizeof(rtv))) { sticky: /* * If an application puts its timeval in read-only @@ -460,11 +464,16 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, ret = core_sys_select(n, inp, outp, exp, &timeout); if (tsp) { + struct timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (timespec_compare(&rts, &ts) < 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only @@ -758,12 +767,17 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, sigprocmask(SIG_SETMASK, &sigsaved, NULL); if (tsp && timeout >= 0) { + struct timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; /* Yes, we know it's actually an s64, but it's also positive. */ - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (timespec_compare(&rts, &ts) < 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only diff --git a/include/linux/compat.h b/include/linux/compat.h index f9ca534787e2..c9ab2a26348c 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -161,5 +161,25 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from); int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); +static inline int compat_timeval_compare(struct compat_timeval *lhs, + struct compat_timeval *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_usec - rhs->tv_usec; +} + +static inline int compat_timespec_compare(struct compat_timespec *lhs, + struct compat_timespec *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_nsec - rhs->tv_nsec; +} + #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ diff --git a/include/linux/time.h b/include/linux/time.h index 7b4dc36532bb..d9cdba54b789 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -33,11 +33,34 @@ struct timezone { #define NSEC_PER_SEC 1000000000L #define NSEC_PER_USEC 1000L -static __inline__ int timespec_equal(struct timespec *a, struct timespec *b) +static inline int timespec_equal(struct timespec *a, struct timespec *b) { return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } +/* + * lhs < rhs: return <0 + * lhs == rhs: return 0 + * lhs > rhs: return >0 + */ +static inline int timespec_compare(struct timespec *lhs, struct timespec *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_nsec - rhs->tv_nsec; +} + +static inline int timeval_compare(struct timeval *lhs, struct timeval *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_usec - rhs->tv_usec; +} + extern unsigned long mktime(const unsigned int year, const unsigned int mon, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec); -- cgit v1.2.3 From bc7fc0601b3eb2254f080492f3fd69e319ed32d0 Mon Sep 17 00:00:00 2001 From: "Antonino A. Daplas" Date: Sat, 11 Feb 2006 17:56:07 -0800 Subject: [PATCH] nvidiafb: Add support for Geforce4 MX 4000 Add support for Geforce4 MX 4000 (0x185) Signed-off-by: Antonino Daplas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/nvidia/nvidia.c | 2 ++ include/linux/pci_ids.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c index dbcb8962e57d..a7c4e5e8ead6 100644 --- a/drivers/video/nvidia/nvidia.c +++ b/drivers/video/nvidia/nvidia.c @@ -138,6 +138,8 @@ static struct pci_device_id nvidiafb_pci_tbl[] = { PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_420_8X, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_4000, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_448_GO, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_488_GO, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 7a61ccdcbc4b..82b83da25d77 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1087,6 +1087,7 @@ #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_440_8X 0x0181 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_440SE_8X 0x0182 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_420_8X 0x0183 +#define PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_4000 0x0185 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_448_GO 0x0186 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_488_GO 0x0187 #define PCI_DEVICE_ID_NVIDIA_QUADRO4_580_XGL 0x0188 -- cgit v1.2.3 From 7c8903f6373f9abecf060bad53ca36bc4ac037f2 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 14 Feb 2006 13:53:03 -0800 Subject: [PATCH] jbd: revert checkpoint list changes This patch reverts commit f93ea411b73594f7d144855fd34278bcf34a9afc: [PATCH] jbd: split checkpoint lists This broke journal_flush() for OCFS2, which is its method of being sure that metadata is sent to disk for another node. And two related commits 8d3c7fce2d20ecc3264c8d8c91ae3beacdeaed1b and 43c3e6f5abdf6acac9b90c86bf03f995bf7d3d92 with the subjects: [PATCH] jbd: log_do_checkpoint fix [PATCH] jbd: remove_transaction fix These seem to be incremental bugfixes on the original patch and as such are no longer needed. Signed-off-by: Mark Fasheh Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/checkpoint.c | 418 ++++++++++++++++++++++------------------------------ fs/jbd/commit.c | 3 +- include/linux/jbd.h | 8 +- 3 files changed, 179 insertions(+), 250 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index e6265a0b56b8..543ed543d1e5 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -24,75 +24,29 @@ #include /* - * Unlink a buffer from a transaction checkpoint list. + * Unlink a buffer from a transaction. * * Called with j_list_lock held. */ -static void __buffer_unlink_first(struct journal_head *jh) +static inline void __buffer_unlink(struct journal_head *jh) { transaction_t *transaction; transaction = jh->b_cp_transaction; + jh->b_cp_transaction = NULL; jh->b_cpnext->b_cpprev = jh->b_cpprev; jh->b_cpprev->b_cpnext = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) { + if (transaction->t_checkpoint_list == jh) transaction->t_checkpoint_list = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) - transaction->t_checkpoint_list = NULL; - } -} - -/* - * Unlink a buffer from a transaction checkpoint(io) list. - * - * Called with j_list_lock held. - */ - -static inline void __buffer_unlink(struct journal_head *jh) -{ - transaction_t *transaction; - - transaction = jh->b_cp_transaction; - - __buffer_unlink_first(jh); - if (transaction->t_checkpoint_io_list == jh) { - transaction->t_checkpoint_io_list = jh->b_cpnext; - if (transaction->t_checkpoint_io_list == jh) - transaction->t_checkpoint_io_list = NULL; - } -} - -/* - * Move a buffer from the checkpoint list to the checkpoint io list - * - * Called with j_list_lock held - */ - -static inline void __buffer_relink_io(struct journal_head *jh) -{ - transaction_t *transaction; - - transaction = jh->b_cp_transaction; - __buffer_unlink_first(jh); - - if (!transaction->t_checkpoint_io_list) { - jh->b_cpnext = jh->b_cpprev = jh; - } else { - jh->b_cpnext = transaction->t_checkpoint_io_list; - jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; - jh->b_cpprev->b_cpnext = jh; - jh->b_cpnext->b_cpprev = jh; - } - transaction->t_checkpoint_io_list = jh; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; } /* * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it and 2 if we also released the - * whole transaction. - * + * Returns 1 if we released it. * Requires j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ @@ -103,11 +57,12 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { JBUFFER_TRACE(jh, "remove from checkpoint list"); - ret = __journal_remove_checkpoint(jh) + 1; + __journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); + ret = 1; } else { jbd_unlock_bh_state(bh); } @@ -162,53 +117,83 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) } /* - * Clean up transaction's list of buffers submitted for io. - * We wait for any pending IO to complete and remove any clean - * buffers. Note that we take the buffers in the opposite ordering - * from the one in which they were submitted for IO. + * Clean up a transaction's checkpoint list. + * + * We wait for any pending IO to complete and make sure any clean + * buffers are removed from the transaction. + * + * Return 1 if we performed any actions which might have destroyed the + * checkpoint. (journal_remove_checkpoint() deletes the transaction when + * the last checkpoint buffer is cleansed) * * Called with j_list_lock held. */ - -static void __wait_cp_io(journal_t *journal, transaction_t *transaction) +static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) { - struct journal_head *jh; + struct journal_head *jh, *next_jh, *last_jh; struct buffer_head *bh; - tid_t this_tid; - int released = 0; - - this_tid = transaction->t_tid; -restart: - /* Didn't somebody clean up the transaction in the meanwhile */ - if (journal->j_checkpoint_transactions != transaction || - transaction->t_tid != this_tid) - return; - while (!released && transaction->t_checkpoint_io_list) { - jh = transaction->t_checkpoint_io_list; + int ret = 0; + + assert_spin_locked(&journal->j_list_lock); + jh = transaction->t_checkpoint_list; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + next_jh = jh; + do { + jh = next_jh; bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - spin_lock(&journal->j_list_lock); - goto restart; - } if (buffer_locked(bh)) { atomic_inc(&bh->b_count); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); - spin_lock(&journal->j_list_lock); - goto restart; + goto out_return_1; } + /* - * Now in whatever state the buffer currently is, we know that - * it has been written out and so we can drop it from the list + * This is foul */ - released = __journal_remove_checkpoint(jh); - jbd_unlock_bh_state(bh); - } + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + goto out_return_1; + } + + if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + log_start_commit(journal, tid); + log_wait_commit(journal, tid); + goto out_return_1; + } + + /* + * AKPM: I think the buffer_jbddirty test is redundant - it + * shouldn't have NULL b_transaction? + */ + next_jh = jh->b_cpnext; + if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) { + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + __brelse(bh); + ret = 1; + } else { + jbd_unlock_bh_state(bh); + } + } while (jh != last_jh); + + return ret; +out_return_1: + spin_lock(&journal->j_list_lock); + return 1; } #define NR_BATCH 64 @@ -218,7 +203,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; + spin_unlock(&journal->j_list_lock); ll_rw_block(SWRITE, *batch_count, bhs); + spin_lock(&journal->j_list_lock); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; clear_buffer_jwrite(bh); @@ -234,46 +221,19 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Return 1 if something happened which requires us to abort the current * scan of the checkpoint list. * - * Called with j_list_lock held and drops it if 1 is returned + * Called with j_list_lock held. * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ -static int __process_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count) +static int __flush_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count, + int *drop_count) { struct buffer_head *bh = jh2bh(jh); int ret = 0; - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - put_bh(bh); - ret = 1; - } - else if (jh->b_transaction != NULL) { - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; + if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { + J_ASSERT_JH(jh, jh->b_transaction == NULL); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - log_start_commit(journal, tid); - log_wait_commit(journal, tid); - ret = 1; - } - else if (!buffer_dirty(bh)) { - J_ASSERT_JH(jh, !buffer_jbddirty(bh)); - BUFFER_TRACE(bh, "remove from checkpoint"); - __journal_remove_checkpoint(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - ret = 1; - } - else { /* * Important: we are about to write the buffer, and * possibly block, while still holding the journal lock. @@ -286,30 +246,45 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, J_ASSERT_BH(bh, !buffer_jwrite(bh)); set_buffer_jwrite(bh); bhs[*batch_count] = bh; - __buffer_relink_io(jh); jbd_unlock_bh_state(bh); (*batch_count)++; if (*batch_count == NR_BATCH) { - spin_unlock(&journal->j_list_lock); __flush_batch(journal, bhs, batch_count); ret = 1; } + } else { + int last_buffer = 0; + if (jh->b_cpnext == jh) { + /* We may be about to drop the transaction. Tell the + * caller that the lists have changed. + */ + last_buffer = 1; + } + if (__try_to_free_cp_buf(jh)) { + (*drop_count)++; + ret = last_buffer; + } } return ret; } /* - * Perform an actual checkpoint. We take the first transaction on the - * list of transactions to be checkpointed and send all its buffers - * to disk. We submit larger chunks of data at once. + * Perform an actual checkpoint. We don't write out only enough to + * satisfy the current blocked requests: rather we submit a reasonably + * sized chunk of the outstanding data to disk at once for + * efficiency. __log_wait_for_space() will retry if we didn't free enough. * + * However, we _do_ take into account the amount requested so that once + * the IO has been queued, we can return as soon as enough of it has + * completed to disk. + * * The journal should be locked before calling this function. */ int log_do_checkpoint(journal_t *journal) { - transaction_t *transaction; - tid_t this_tid; int result; + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; jbd_debug(1, "Start checkpoint\n"); @@ -324,70 +299,79 @@ int log_do_checkpoint(journal_t *journal) return result; /* - * OK, we need to start writing disk blocks. Take one transaction - * and write it. + * OK, we need to start writing disk blocks. Try to free up a + * quarter of the log in a single checkpoint if we can. */ - spin_lock(&journal->j_list_lock); - if (!journal->j_checkpoint_transactions) - goto out; - transaction = journal->j_checkpoint_transactions; - this_tid = transaction->t_tid; -restart: /* - * If someone cleaned up this transaction while we slept, we're - * done (maybe it's a new transaction, but it fell at the same - * address). + * AKPM: check this code. I had a feeling a while back that it + * degenerates into a busy loop at unmount time. */ - if (journal->j_checkpoint_transactions == transaction && - transaction->t_tid == this_tid) { - int batch_count = 0; - struct buffer_head *bhs[NR_BATCH]; - struct journal_head *jh; - int retry = 0; - - while (!retry && transaction->t_checkpoint_list) { + spin_lock(&journal->j_list_lock); + while (journal->j_checkpoint_transactions) { + transaction_t *transaction; + struct journal_head *jh, *last_jh, *next_jh; + int drop_count = 0; + int cleanup_ret, retry = 0; + tid_t this_tid; + + transaction = journal->j_checkpoint_transactions; + this_tid = transaction->t_tid; + jh = transaction->t_checkpoint_list; + last_jh = jh->b_cpprev; + next_jh = jh; + do { struct buffer_head *bh; - jh = transaction->t_checkpoint_list; + jh = next_jh; + next_jh = jh->b_cpnext; bh = jh2bh(jh); if (!jbd_trylock_bh_state(bh)) { jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); retry = 1; break; } - retry = __process_buffer(journal, jh, bhs, - &batch_count); - if (!retry && - lock_need_resched(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); + retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count); + if (cond_resched_lock(&journal->j_list_lock)) { retry = 1; break; } - } + } while (jh != last_jh && !retry); if (batch_count) { - if (!retry) { - spin_unlock(&journal->j_list_lock); - retry = 1; - } __flush_batch(journal, bhs, &batch_count); + retry = 1; } - if (retry) { - spin_lock(&journal->j_list_lock); - goto restart; - } /* - * Now we have cleaned up the first transaction's checkpoint - * list. Let's clean up the second one. + * If someone cleaned up this transaction while we slept, we're + * done + */ + if (journal->j_checkpoint_transactions != transaction) + break; + if (retry) + continue; + /* + * Maybe it's a new transaction, but it fell at the same + * address */ - __wait_cp_io(journal, transaction); + if (transaction->t_tid != this_tid) + continue; + /* + * We have walked the whole transaction list without + * finding anything to write to disk. We had better be + * able to make some progress or we are in trouble. + */ + cleanup_ret = __cleanup_transaction(journal, transaction); + J_ASSERT(drop_count != 0 || cleanup_ret != 0); + if (journal->j_checkpoint_transactions != transaction) + break; } -out: spin_unlock(&journal->j_list_lock); result = cleanup_journal_tail(journal); if (result < 0) return result; + return 0; } @@ -471,53 +455,6 @@ int cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ -/* - * journal_clean_one_cp_list - * - * Find all the written-back checkpoint buffers in the given list and release them. - * - * Called with the journal locked. - * Called with j_list_lock held. - * Returns number of bufers reaped (for debug) - */ - -static int journal_clean_one_cp_list(struct journal_head *jh, int *released) -{ - struct journal_head *last_jh; - struct journal_head *next_jh = jh; - int ret, freed = 0; - - *released = 0; - if (!jh) - return 0; - - last_jh = jh->b_cpprev; - do { - jh = next_jh; - next_jh = jh->b_cpnext; - /* Use trylock because of the ranking */ - if (jbd_trylock_bh_state(jh2bh(jh))) { - ret = __try_to_free_cp_buf(jh); - if (ret) { - freed++; - if (ret == 2) { - *released = 1; - return freed; - } - } - } - /* - * This function only frees up some memory if possible so we - * dont have an obligation to finish processing. Bail out if - * preemption requested: - */ - if (need_resched()) - return freed; - } while (jh != last_jh); - - return freed; -} - /* * journal_clean_checkpoint_list * @@ -525,38 +462,46 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) * * Called with the journal locked. * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) + * Returns number of bufers reaped (for debug) */ int __journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; - int ret = 0, released; + int ret = 0; transaction = journal->j_checkpoint_transactions; - if (!transaction) + if (transaction == 0) goto out; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { + struct journal_head *jh; + transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_list, &released); - if (need_resched()) - goto out; - if (released) - continue; - /* - * It is essential that we are as careful as in the case of - * t_checkpoint_list with removing the buffer from the list as - * we can possibly see not yet submitted buffers on io_list - */ - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, &released); - if (need_resched()) - goto out; + jh = transaction->t_checkpoint_list; + if (jh) { + struct journal_head *last_jh = jh->b_cpprev; + struct journal_head *next_jh = jh; + + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranknig */ + if (jbd_trylock_bh_state(jh2bh(jh))) + ret += __try_to_free_cp_buf(jh); + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. Bail out if preemption + * requested: + */ + if (need_resched()) + goto out; + } while (jh != last_jh); + } } while (transaction != last_transaction); out: return ret; @@ -571,22 +516,18 @@ out: * buffer updates committed in that transaction have safely been stored * elsewhere on disk. To achieve this, all of the buffers in a * transaction need to be maintained on the transaction's checkpoint - * lists until they have been rewritten, at which point this function is + * list until they have been rewritten, at which point this function is * called to remove the buffer from the existing transaction's - * checkpoint lists. - * - * The function returns 1 if it frees the transaction, 0 otherwise. + * checkpoint list. * * This function is called with the journal locked. * This function is called with j_list_lock held. - * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ -int __journal_remove_checkpoint(struct journal_head *jh) +void __journal_remove_checkpoint(struct journal_head *jh) { transaction_t *transaction; journal_t *journal; - int ret = 0; JBUFFER_TRACE(jh, "entry"); @@ -597,10 +538,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) journal = transaction->t_journal; __buffer_unlink(jh); - jh->b_cp_transaction = NULL; - if (transaction->t_checkpoint_list != NULL || - transaction->t_checkpoint_io_list != NULL) + if (transaction->t_checkpoint_list != NULL) goto out; JBUFFER_TRACE(jh, "transaction has no more buffers"); @@ -626,10 +565,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) /* Just in case anybody was waiting for more transactions to be checkpointed... */ wake_up(&journal->j_wait_logspace); - ret = 1; out: JBUFFER_TRACE(jh, "exit"); - return ret; } /* @@ -691,7 +628,6 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(transaction->t_shadow_list == NULL); J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); - J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(transaction->t_updates == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 29e62d98bae6..002ad2bbc769 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -829,8 +829,7 @@ restart_loop: journal->j_committing_transaction = NULL; spin_unlock(&journal->j_state_lock); - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { + if (commit_transaction->t_checkpoint_list == NULL) { __journal_drop_transaction(journal, commit_transaction); } else { if (journal->j_checkpoint_transactions == NULL) { diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 0fe4aa891ddc..41ee79962bb2 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -497,12 +497,6 @@ struct transaction_s */ struct journal_head *t_checkpoint_list; - /* - * Doubly-linked circular list of all buffers submitted for IO while - * checkpointing. [j_list_lock] - */ - struct journal_head *t_checkpoint_io_list; - /* * Doubly-linked circular list of temporary buffers currently undergoing * IO in the log [j_list_lock] @@ -852,7 +846,7 @@ extern void journal_commit_transaction(journal_t *); /* Checkpoint list management */ int __journal_clean_checkpoint_list(journal_t *journal); -int __journal_remove_checkpoint(struct journal_head *); +void __journal_remove_checkpoint(struct journal_head *); void __journal_insert_checkpoint(struct journal_head *, transaction_t *); /* Buffer IO */ -- cgit v1.2.3 From 5ac5f9d1ce8492163dbde5d357dc5d03becf7e36 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 14 Feb 2006 13:53:04 -0800 Subject: [PATCH] NLM: Fix the NLM_GRANTED callback checks If 2 threads attached to the same process are blocking on different locks on different files (maybe even on different servers) but have the same lock arguments (i.e. same offset+length - actually quite common, since most processes try to lock the entire file) then the first GRANTED call that wakes one up will also wake the other. Currently when the NLM_GRANTED callback comes in, lockd walks the list of blocked locks in search of a match to the lock that the NLM server has granted. Although it checks the lock pid, start and end, it fails to check the filehandle and the server address. By checking the filehandle and server IP address, we ensure that this only happens if the locks truly are referencing the same file. Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/lockd/clntlock.c | 27 +++++++++++++++++---------- fs/lockd/svc4proc.c | 2 +- fs/lockd/svcproc.c | 2 +- include/linux/lockd/lockd.h | 6 +++--- 4 files changed, 22 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 3eaf6e701087..da6354baa0b8 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -111,9 +111,10 @@ long nlmclnt_block(struct nlm_rqst *req, long timeout) /* * The server lockd has called us back to tell us the lock was granted */ -u32 -nlmclnt_grant(struct nlm_lock *lock) +u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) { + const struct file_lock *fl = &lock->fl; + const struct nfs_fh *fh = &lock->fh; struct nlm_wait *block; u32 res = nlm_lck_denied; @@ -122,14 +123,20 @@ nlmclnt_grant(struct nlm_lock *lock) * Warning: must not use cookie to match it! */ list_for_each_entry(block, &nlm_blocked, b_list) { - if (nlm_compare_locks(block->b_lock, &lock->fl)) { - /* Alright, we found a lock. Set the return status - * and wake up the caller - */ - block->b_status = NLM_LCK_GRANTED; - wake_up(&block->b_wait); - res = nlm_granted; - } + struct file_lock *fl_blocked = block->b_lock; + + if (!nlm_compare_locks(fl_blocked, fl)) + continue; + if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) + continue; + if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_dentry->d_inode) ,fh) != 0) + continue; + /* Alright, we found a lock. Set the return status + * and wake up the caller + */ + block->b_status = NLM_LCK_GRANTED; + wake_up(&block->b_wait); + res = nlm_granted; } return res; } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4063095d849e..b10f913aa06a 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -228,7 +228,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(&argp->lock); + resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 3bc437e0cf5b..35681d9cf1fc 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -256,7 +256,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(&argp->lock); + resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 920766cea79c..ef21ed296039 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -149,7 +149,7 @@ struct nlm_rqst * nlmclnt_alloc_call(void); int nlmclnt_prepare_block(struct nlm_rqst *req, struct nlm_host *host, struct file_lock *fl); void nlmclnt_finish_block(struct nlm_rqst *req); long nlmclnt_block(struct nlm_rqst *req, long timeout); -u32 nlmclnt_grant(struct nlm_lock *); +u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *); void nlmclnt_recovery(struct nlm_host *, u32); int nlmclnt_reclaim(struct nlm_host *, struct file_lock *); int nlmclnt_setgrantargs(struct nlm_rqst *, struct nlm_lock *); @@ -204,7 +204,7 @@ nlmsvc_file_inode(struct nlm_file *file) * Compare two host addresses (needs modifying for ipv6) */ static __inline__ int -nlm_cmp_addr(struct sockaddr_in *sin1, struct sockaddr_in *sin2) +nlm_cmp_addr(const struct sockaddr_in *sin1, const struct sockaddr_in *sin2) { return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; } @@ -214,7 +214,7 @@ nlm_cmp_addr(struct sockaddr_in *sin1, struct sockaddr_in *sin2) * When the second lock is of type F_UNLCK, this acts like a wildcard. */ static __inline__ int -nlm_compare_locks(struct file_lock *fl1, struct file_lock *fl2) +nlm_compare_locks(const struct file_lock *fl1, const struct file_lock *fl2) { return fl1->fl_pid == fl2->fl_pid && fl1->fl_start == fl2->fl_start -- cgit v1.2.3 From d6077cb80cde4506720f9165eba99ee07438513f Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Tue, 14 Feb 2006 13:53:10 -0800 Subject: [PATCH] sched: revert "filter affine wakeups" Revert commit d7102e95b7b9c00277562c29aad421d2d521c5f6: [PATCH] sched: filter affine wakeups Apparently caused more than 10% performance regression for aim7 benchmark. The setup in use is 16-cpu HP rx8620, 64Gb of memory and 12 MSA1000s with 144 disks. Each disk is 72Gb with a single ext3 filesystem (courtesy of HP, who supplied benchmark results). The problem is, for aim7, the wake-up pattern is random, but it still needs load balancing action in the wake-up path to achieve best performance. With the above commit, lack of load balancing hurts that workload. However, for workloads like database transaction processing, the requirement is exactly opposite. In the wake up path, best performance is achieved with absolutely zero load balancing. We simply wake up the process on the CPU that it was previously run. Worst performance is obtained when we do load balancing at wake up. There isn't an easy way to auto detect the workload characteristics. Ingo's earlier patch that detects idle CPU and decide whether to load balance or not doesn't perform with aim7 either since all CPUs are busy (it causes even bigger perf. regression). Revert commit d7102e95b7b9c00277562c29aad421d2d521c5f6, which causes more than 10% performance regression with aim7. Signed-off-by: Ken Chen Acked-by: Ingo Molnar Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 +---- kernel/sched.c | 10 +--------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c1da0269a18..b6f51e3a38ec 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -697,11 +697,8 @@ struct task_struct { int lock_depth; /* BKL lock depth */ -#if defined(CONFIG_SMP) - int last_waker_cpu; /* CPU that last woke this task up */ -#if defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) int oncpu; -#endif #endif int prio, static_prio; struct list_head run_list; diff --git a/kernel/sched.c b/kernel/sched.c index 87d93be336a1..66d957227de9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1204,9 +1204,6 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) } } - if (p->last_waker_cpu != this_cpu) - goto out_set_cpu; - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; @@ -1277,8 +1274,6 @@ out_set_cpu: cpu = task_cpu(p); } - p->last_waker_cpu = this_cpu; - out_activate: #endif /* CONFIG_SMP */ if (old_state == TASK_UNINTERRUPTIBLE) { @@ -1360,12 +1355,9 @@ void fastcall sched_fork(task_t *p, int clone_flags) #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) - p->last_waker_cpu = cpu; -#if defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) p->oncpu = 0; #endif -#endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; -- cgit v1.2.3