From 4a17fd5229c1b6066aa478f6b690f8293ce811a1 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 19 Apr 2012 03:39:36 +0000 Subject: sock: Introduce named constants for sk_reuse Name them in a "backward compatible" manner, i.e. reuse or not are still 1 and 0 respectively. The reuse value of 2 means that the socket with it will forcibly reuse everyone else's port. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- drivers/block/drbd/drbd_receiver.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 43beaca53179..436f519bed1c 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -664,7 +664,7 @@ static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) timeo = mdev->net_conf->try_connect_int * HZ; timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ - s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */ + s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ s_listen->sk->sk_rcvtimeo = timeo; s_listen->sk->sk_sndtimeo = timeo; drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size, @@ -841,8 +841,8 @@ retry: } } while (1); - msock->sk->sk_reuse = 1; /* SO_REUSEADDR */ - sock->sk->sk_reuse = 1; /* SO_REUSEADDR */ + msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ sock->sk->sk_allocation = GFP_NOIO; msock->sk->sk_allocation = GFP_NOIO; -- cgit v1.2.3 From ebd2b0cde5a4c02e2999fc3dc4a59fdd8a040fb6 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 25 May 2011 11:03:04 +0200 Subject: drbd: Lower log priority for an event that is definitely not an error Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 43beaca53179..e7ed0aa93a16 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -957,7 +957,7 @@ static void drbd_flush(struct drbd_conf *mdev) rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, NULL); if (rv) { - dev_err(DEV, "local disk flush failed with status %d\n", rv); + dev_info(DEV, "local disk flush failed with status %d\n", rv); /* would rather check on EOPNOTSUPP, but that is not reliable. * don't try again for ANY return value != 0 * if (rv == -EOPNOTSUPP) */ -- cgit v1.2.3 From 1381e9a4969d8d24ce7b4bab048f3cb3fc9c1283 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 3 Jun 2011 21:13:17 +0200 Subject: drbd: cosmetic: fix accidental division instead of modulo when pretty printing For large resync rates, seq_printf_with_thousands_grouping() accidentally only produced Y,000,00Y, instead of the real numbers. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 2959cdfb77f5..869bada2ed06 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -52,7 +52,7 @@ void seq_printf_with_thousands_grouping(struct seq_file *seq, long v) if (unlikely(v >= 1000000)) { /* cool: > GiByte/s */ seq_printf(seq, "%ld,", v / 1000000); - v /= 1000000; + v %= 1000000; seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000); } else if (likely(v >= 1000)) seq_printf(seq, "%ld,%03ld", v/1000, v % 1000); -- cgit v1.2.3 From 7948bcdc38b9af9ef3e72199cdea1d775a9537fc Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 6 Jun 2011 15:36:04 +0200 Subject: drbd: spelling fix: too small It is not "to small", but "too small". Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 8 ++++---- include/linux/drbd.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index abfaacaaf346..f7b5f7e86a57 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1032,7 +1032,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", (unsigned long long) drbd_get_max_capacity(nbc), (unsigned long long) nbc->dc.disk_size); - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto fail; } @@ -1046,7 +1046,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp } if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { - retcode = ERR_MD_DISK_TO_SMALL; + retcode = ERR_MD_DISK_TOO_SMALL; dev_warn(DEV, "refusing attach: md-device too small, " "at least %llu sectors needed for this meta-disk type\n", (unsigned long long) min_md_device_sectors); @@ -1057,7 +1057,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp * (we may currently be R_PRIMARY with no local disk...) */ if (drbd_get_max_capacity(nbc) < drbd_get_capacity(mdev->this_bdev)) { - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto fail; } @@ -1138,7 +1138,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) { dev_warn(DEV, "refusing to truncate a consistent device\n"); - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto force_diskless_dec; } diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 9e5f5607eba3..cb8728b28432 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -112,8 +112,8 @@ enum drbd_ret_code { ERR_OPEN_MD_DISK = 105, ERR_DISK_NOT_BDEV = 107, ERR_MD_NOT_BDEV = 108, - ERR_DISK_TO_SMALL = 111, - ERR_MD_DISK_TO_SMALL = 112, + ERR_DISK_TOO_SMALL = 111, + ERR_MD_DISK_TOO_SMALL = 112, ERR_BDCLAIM_DISK = 114, ERR_BDCLAIM_MD_DISK = 115, ERR_MD_IDX_INVALID = 116, -- cgit v1.2.3 From 5f138ce01ae6430db2e2cebd0a945dff75581d62 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 15 Jun 2011 00:59:04 +0100 Subject: DRBD: Fix comparison always false warning due to long/long long compare Fix warnings of the following nature in the drbd header: In file included from drivers/block/drbd/drbd_bitmap.c:32: drivers/block/drbd/drbd_int.h: In function 'drbd_get_syncer_progress': drivers/block/drbd/drbd_int.h:2234: warning: comparison is always false due to limited range of data where mdev->rs_total (an unsigned long) is being compared to 1ULL << 32, which is always false on a 32-bit machine. Signed-off-by: David Howells Signed-off-by: Lars Ellenberg Signed-off-by: Philipp Reisner --- drivers/block/drbd/drbd_int.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 8d680562ba73..2be057b8f568 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -2230,7 +2230,7 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, * Note: currently we don't support such large bitmaps on 32bit * arch anyways, but no harm done to be prepared for it here. */ - unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10; + unsigned int shift = mdev->rs_total > UINT_MAX ? 16 : 10; unsigned long left = *bits_left >> shift; unsigned long total = 1UL + (mdev->rs_total >> shift); unsigned long tmp = 1000UL - left * 1000UL/total; -- cgit v1.2.3 From 07667347c89e5d971243a411ca4ed83ad14f9f9e Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 21 Jun 2011 01:13:37 +0200 Subject: drbd: downgraded error printk to info Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 211fc44f84be..906aca9b6659 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1530,9 +1530,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, drbd_disk_str(mdev->state.disk)); if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that I am detaching my disk\n"); - else - dev_err(DEV, "Sending state for detaching disk failed\n"); + dev_info(DEV, "Notified peer that I am detaching my disk\n"); drbd_rs_cancel_all(mdev); @@ -1562,7 +1560,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, atomic_set(&mdev->rs_pending_cnt, 0); if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that I'm now diskless.\n"); + dev_info(DEV, "Notified peer that I'm now diskless.\n"); /* corresponding get_ldev in __drbd_set_state * this may finally trigger drbd_ldev_destroy. */ put_ldev(mdev); -- cgit v1.2.3 From 77e8fdfc188f0490754f703f80d2853c69052c12 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 29 Jun 2011 10:49:13 +0200 Subject: drbd: Only print sanitize state's warnings, if the state change happens The reason for this change is that, with when doing 'drbdadm invalidate' on a disconnected resource caused an "implicitly set pdsk from UpToDate to DUnknown" message, which was missleading. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 55 ++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 15 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 906aca9b6659..8e83a402ed5c 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -509,8 +509,16 @@ static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *, union drbd_state, union drbd_state); +enum sanitize_state_warnings { + NO_WARNING, + ABORTED_ONLINE_VERIFY, + ABORTED_RESYNC, + CONNECTION_LOST_NEGOTIATING, + IMPLICITLY_UPGRADED_DISK, + IMPLICITLY_UPGRADED_PDSK, +}; static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, const char **warn_sync_abort); + union drbd_state ns, enum sanitize_state_warnings *warn); int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); @@ -803,6 +811,21 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, return rv; } +static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) +{ + static const char *msg_table[] = { + [NO_WARNING] = "", + [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", + [ABORTED_RESYNC] = "Resync aborted.", + [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", + [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", + [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", + }; + + if (warn != NO_WARNING) + dev_warn(DEV, "%s\n", msg_table[warn]); +} + /** * sanitize_state() - Resolves implicitly necessary additional changes to a state transition * @mdev: DRBD device. @@ -814,11 +837,14 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, * to D_UNKNOWN. This rule and many more along those lines are in this function. */ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, const char **warn_sync_abort) + union drbd_state ns, enum sanitize_state_warnings *warn) { enum drbd_fencing_p fp; enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; + if (warn) + *warn = NO_WARNING; + fp = FP_DONT_CARE; if (get_ldev(mdev)) { fp = mdev->ldev->dc.fencing; @@ -863,10 +889,9 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state /* Abort resync if a disk fails/detaches */ if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { - if (warn_sync_abort) - *warn_sync_abort = - os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? - "Online-verify" : "Resync"; + if (warn) + *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? + ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; ns.conn = C_CONNECTED; } @@ -877,7 +902,8 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns.disk = mdev->new_state_tmp.disk; ns.pdsk = mdev->new_state_tmp.pdsk; } else { - dev_alert(DEV, "Connection lost while negotiating, no data!\n"); + if (warn) + *warn = CONNECTION_LOST_NEGOTIATING; ns.disk = D_DISKLESS; ns.pdsk = D_UNKNOWN; } @@ -959,16 +985,16 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns.disk = disk_max; if (ns.disk < disk_min) { - dev_warn(DEV, "Implicitly set disk from %s to %s\n", - drbd_disk_str(ns.disk), drbd_disk_str(disk_min)); + if (warn) + *warn = IMPLICITLY_UPGRADED_DISK; ns.disk = disk_min; } if (ns.pdsk > pdsk_max) ns.pdsk = pdsk_max; if (ns.pdsk < pdsk_min) { - dev_warn(DEV, "Implicitly set pdsk from %s to %s\n", - drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min)); + if (warn) + *warn = IMPLICITLY_UPGRADED_PDSK; ns.pdsk = pdsk_min; } @@ -1045,12 +1071,12 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, { union drbd_state os; enum drbd_state_rv rv = SS_SUCCESS; - const char *warn_sync_abort = NULL; + enum sanitize_state_warnings ssw; struct after_state_chg_work *ascw; os = mdev->state; - ns = sanitize_state(mdev, os, ns, &warn_sync_abort); + ns = sanitize_state(mdev, os, ns, &ssw); if (ns.i == os.i) return SS_NOTHING_TO_DO; @@ -1076,8 +1102,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, return rv; } - if (warn_sync_abort) - dev_warn(DEV, "%s aborted.\n", warn_sync_abort); + print_sanitize_warnings(mdev, ssw); { char *pbp, pb[300]; -- cgit v1.2.3 From 6809384c7152c34e74e29a4033826a300eb94f11 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 30 Jun 2011 15:43:06 +0200 Subject: drbd: Improve compatibility with drbd's older than 8.3.7 Regression introduced with 8.3.11 commit: drbd: Take a more conservative approach when deciding max_bio_size Never ever tell an older drbd, that we support more than 32KiB in a single data request (packet). Never believe an older drbd, that is supports more than 32KiB in a single data request (packet) Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 4 ++++ drivers/block/drbd/drbd_nl.c | 7 ++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8e83a402ed5c..a15135b9b60a 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2112,6 +2112,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ } + /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */ + if (mdev->agreed_pro_version <= 94) + max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + p.d_size = cpu_to_be64(d_size); p.u_size = cpu_to_be64(u_size); p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index f7b5f7e86a57..cabad39f908c 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -845,9 +845,10 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) Because new from 8.3.8 onwards the peer can use multiple BIOs for a single peer_request */ if (mdev->state.conn >= C_CONNECTED) { - if (mdev->agreed_pro_version < 94) - peer = mdev->peer_max_bio_size; - else if (mdev->agreed_pro_version == 94) + if (mdev->agreed_pro_version < 94) { + peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ + } else if (mdev->agreed_pro_version == 94) peer = DRBD_MAX_SIZE_H80_PACKET; else /* drbd 8.3.8 onwards */ peer = DRBD_MAX_BIO_SIZE; -- cgit v1.2.3 From 6d7e32f56806ad58006720ed98a433b2047444da Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 15 Mar 2011 10:25:18 +0100 Subject: drbd: Keep a reference to barrier acked requests Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 1 + drivers/block/drbd/drbd_main.c | 23 +++++++++++++++++++++-- drivers/block/drbd/drbd_req.h | 1 + 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 2be057b8f568..5bb5114dd23c 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1022,6 +1022,7 @@ struct drbd_conf { struct drbd_tl_epoch *newest_tle; struct drbd_tl_epoch *oldest_tle; struct list_head out_of_sequence_requests; + struct list_head barrier_acked_requests; struct hlist_head *tl_hash; unsigned int tl_hash_s; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a15135b9b60a..8e489a6d022e 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -208,6 +208,7 @@ static int tl_init(struct drbd_conf *mdev) mdev->oldest_tle = b; mdev->newest_tle = b; INIT_LIST_HEAD(&mdev->out_of_sequence_requests); + INIT_LIST_HEAD(&mdev->barrier_acked_requests); mdev->tl_hash = NULL; mdev->tl_hash_s = 0; @@ -311,7 +312,7 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, These have been list_move'd to the out_of_sequence_requests list in _req_mod(, barrier_acked) above. */ - list_del_init(&b->requests); + list_splice_init(&b->requests, &mdev->barrier_acked_requests); nob = b->next; if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { @@ -343,7 +344,7 @@ bail: * @what: The action/event to perform with all request objects * * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io, - * restart_frozen_disk_io. + * restart_frozen_disk_io, abort_disk_io. */ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) { @@ -411,6 +412,24 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) b = tmp; list_splice(&carry_reads, &b->requests); } + + /* Actions operating on the disk state, also want to work on + requests that got barrier acked. */ + switch (what) { + case abort_disk_io: + case fail_frozen_disk_io: + case restart_frozen_disk_io: + list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + _req_mod(req, what); + } + + case connection_lost_while_pending: + case resend: + break; + default: + dev_err(DEV, "what = %d in _tl_restart()\n", what); + } } diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 68a234a5fdc5..74c5b9f14d61 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -105,6 +105,7 @@ enum drbd_req_event { read_completed_with_error, read_ahead_completed_with_error, write_completed_with_error, + abort_disk_io, completed_ok, resend, fail_frozen_disk_io, -- cgit v1.2.3 From 2b4dd36fbae7203a0d503a6cede1f4ce17aa72ac Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 14 Mar 2011 13:01:50 +0100 Subject: drbd: Immediately allow completion of IOs, that wait for IO completions on a failed disk Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 10 ++++++++++ drivers/block/drbd/drbd_req.c | 20 ++++++++++++++++---- drivers/block/drbd/drbd_req.h | 18 +++++++++++------- 3 files changed, 37 insertions(+), 11 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8e489a6d022e..16969c2b96cd 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -368,6 +368,12 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) } tmp = b->next; + if (what == abort_disk_io) { + /* Only walk the TL, leave barrier objects in place */ + b = tmp; + continue; + } + if (n_writes) { if (what == resend) { b->n_writes = n_writes; @@ -1565,6 +1571,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, eh = mdev->ldev->dc.on_io_error; was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); + /* Immediately allow completion of all application IO, that waits + for completion from the local disk. */ + tl_restart(mdev, abort_disk_io); + /* current state still has to be D_FAILED, * there is only one way out: to D_DISKLESS, * and that may only happen after our put_ldev below. */ diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 4a0f314086e5..1a8aac4b0c2f 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -214,8 +214,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) { const unsigned long s = req->rq_state; struct drbd_conf *mdev = req->mdev; - /* only WRITES may end up here without a master bio (on barrier ack) */ - int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE; + int rw = req->rq_state & RQ_WRITE ? WRITE : READ; /* we must not complete the master bio, while it is * still being processed by _drbd_send_zc_bio (drbd_send_dblock) @@ -230,7 +229,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) return; if (s & RQ_NET_PENDING) return; - if (s & RQ_LOCAL_PENDING) + if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) return; if (req->master_bio) { @@ -277,6 +276,9 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) req->master_bio = NULL; } + if (s & RQ_LOCAL_PENDING) + return; + if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { /* this is disconnected (local only) operation, * or protocol C P_WRITE_ACK, @@ -429,7 +431,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case completed_ok: - if (bio_data_dir(req->master_bio) == WRITE) + if (req->rq_state & RQ_WRITE) mdev->writ_cnt += req->size>>9; else mdev->read_cnt += req->size>>9; @@ -441,6 +443,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, put_ldev(mdev); break; + case abort_disk_io: + req->rq_state |= RQ_LOCAL_ABORTED; + if (req->rq_state & RQ_WRITE) + _req_may_be_done_not_susp(req, m); + else + goto goto_queue_for_net_read; + break; + case write_completed_with_error: req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; @@ -469,6 +479,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, __drbd_chk_io_error(mdev, false); put_ldev(mdev); + goto_queue_for_net_read: + /* no point in retrying if there is no good remote data, * or we have no connection. */ if (mdev->state.pdsk != D_UP_TO_DATE) { diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 74c5b9f14d61..3d2111919486 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -119,18 +119,21 @@ enum drbd_req_event { * same time, so we should hold the request lock anyways. */ enum drbd_req_state_bits { - /* 210 - * 000: no local possible - * 001: to be submitted + /* 3210 + * 0000: no local possible + * 0001: to be submitted * UNUSED, we could map: 011: submitted, completion still pending - * 110: completed ok - * 010: completed with error + * 0110: completed ok + * 0010: completed with error + * 1001: Aborted (before completion) + * 1x10: Aborted and completed -> free */ __RQ_LOCAL_PENDING, __RQ_LOCAL_COMPLETED, __RQ_LOCAL_OK, + __RQ_LOCAL_ABORTED, - /* 76543 + /* 87654 * 00000: no network possible * 00001: to be send * 00011: to be send, on worker queue @@ -200,8 +203,9 @@ enum drbd_req_state_bits { #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) #define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) #define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) +#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED) -#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ +#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1) #define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) #define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) -- cgit v1.2.3 From cc94c65015022e7329e80e057e20848581d3f2a5 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Sun, 26 Jun 2011 11:20:27 +0200 Subject: drbd: moved md_io into mdev Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 12 +++++------- drivers/block/drbd/drbd_int.h | 4 ++-- drivers/block/drbd/drbd_worker.c | 3 +++ 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index cf0e63dd97da..c64ca6b956cf 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -71,12 +71,10 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, int rw, int size) { struct bio *bio; - struct drbd_md_io md_io; int ok; - md_io.mdev = mdev; - init_completion(&md_io.event); - md_io.error = 0; + init_completion(&mdev->md_io.event); + mdev->md_io.error = 0; if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags)) rw |= REQ_FUA | REQ_FLUSH; @@ -88,7 +86,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, ok = (bio_add_page(bio, page, size, 0) == size); if (!ok) goto out; - bio->bi_private = &md_io; + bio->bi_private = &mdev->md_io; bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; @@ -96,8 +94,8 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, bio_endio(bio, -EIO); else submit_bio(rw, bio); - wait_for_completion(&md_io.event); - ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; + wait_for_completion(&mdev->md_io.event); + ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0; out: bio_put(bio); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 5bb5114dd23c..91e69ffd5566 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -938,7 +938,6 @@ struct drbd_backing_dev { }; struct drbd_md_io { - struct drbd_conf *mdev; struct completion event; int error; }; @@ -1095,7 +1094,8 @@ struct drbd_conf { wait_queue_head_t ee_wait; struct page *md_io_page; /* one page buffer for md_io */ struct page *md_io_tmpp; /* for logical_block_size != 512 */ - struct mutex md_io_mutex; /* protects the md_io_buffer */ + struct drbd_md_io md_io; + struct mutex md_io_mutex; /* protects the md_io, md_io_page and md_io_tmpp */ spinlock_t al_lock; wait_queue_head_t al_wait; struct lru_cache *act_log; /* activity log */ diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 4d3e6f6213ba..eedfe311c403 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -70,8 +70,11 @@ rwlock_t global_state_lock; void drbd_md_io_complete(struct bio *bio, int error) { struct drbd_md_io *md_io; + struct drbd_conf *mdev; md_io = (struct drbd_md_io *)bio->bi_private; + mdev = container_of(md_io, struct drbd_conf, md_io); + md_io->error = error; complete(&md_io->event); -- cgit v1.2.3 From e17117310b73ce6d2340ad46a539d3896a2d6de8 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 27 Jun 2011 11:51:46 +0200 Subject: drbd: Replaced md_io_mutex by an atomic: md_io_in_use The new function drbd_md_get_buffer() aborts waiting for the buffer in case the disk failes in the meantime. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 44 +++++++++++++++++++++++++++++++--------- drivers/block/drbd/drbd_int.h | 6 ++++-- drivers/block/drbd/drbd_main.c | 19 ++++++++++------- drivers/block/drbd/drbd_worker.c | 1 + 4 files changed, 51 insertions(+), 19 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index c64ca6b956cf..4271352dd72b 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -65,6 +65,23 @@ struct drbd_atodb_wait { int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); +void *drbd_md_get_buffer(struct drbd_conf *mdev) +{ + int r; + + wait_event(mdev->misc_wait, + (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 || + mdev->state.disk <= D_FAILED); + + return r ? NULL : page_address(mdev->md_io_page); +} + +void drbd_md_put_buffer(struct drbd_conf *mdev) +{ + if (atomic_dec_and_test(&mdev->md_io_in_use)) + wake_up(&mdev->misc_wait); +} + static int _drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, struct page *page, sector_t sector, @@ -90,6 +107,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; + atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */ if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) bio_endio(bio, -EIO); else @@ -109,7 +127,7 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int offset = 0; struct page *iop = mdev->md_io_page; - D_ASSERT(mutex_is_locked(&mdev->md_io_mutex)); + D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); BUG_ON(!bdev->md_bdev); @@ -326,8 +344,13 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) return 1; } - mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */ - buffer = (struct al_transaction *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ + if (!buffer) { + dev_err(DEV, "disk failed while waiting for md_io buffer\n"); + complete(&((struct update_al_work *)w)->event); + put_ldev(mdev); + return 1; + } buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); buffer->tr_number = cpu_to_be32(mdev->al_tr_number); @@ -372,7 +395,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); mdev->al_tr_number++; - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); complete(&((struct update_al_work *)w)->event); put_ldev(mdev); @@ -441,8 +464,9 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) /* lock out all other meta data io for now, * and make sure the page is mapped. */ - mutex_lock(&mdev->md_io_mutex); - buffer = page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + return 0; /* Find the valid transaction in the log */ for (i = 0; i <= mx; i++) { @@ -450,7 +474,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (rv == 0) continue; if (rv == -1) { - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 0; } cnr = be32_to_cpu(buffer->tr_number); @@ -476,7 +500,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!found_valid) { dev_warn(DEV, "No usable activity log found.\n"); - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 1; } @@ -491,7 +515,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) rv = drbd_al_read_tr(mdev, bdev, buffer, i); ERR_IF(rv == 0) goto cancel; if (rv == -1) { - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 0; } @@ -532,7 +556,7 @@ cancel: mdev->al_tr_pos = 0; /* ok, we are done with it */ - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", transactions, active_extents); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 91e69ffd5566..55cae74911e8 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1095,7 +1095,7 @@ struct drbd_conf { struct page *md_io_page; /* one page buffer for md_io */ struct page *md_io_tmpp; /* for logical_block_size != 512 */ struct drbd_md_io md_io; - struct mutex md_io_mutex; /* protects the md_io, md_io_page and md_io_tmpp */ + atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ spinlock_t al_lock; wait_queue_head_t al_wait; struct lru_cache *act_log; /* activity log */ @@ -1537,8 +1537,10 @@ extern void resume_next_sg(struct drbd_conf *mdev); extern void suspend_other_sg(struct drbd_conf *mdev); extern int drbd_resync_finished(struct drbd_conf *mdev); /* maybe rather drbd_main.c ? */ +extern void *drbd_md_get_buffer(struct drbd_conf *mdev); +extern void drbd_md_put_buffer(struct drbd_conf *mdev); extern int drbd_md_sync_page_io(struct drbd_conf *mdev, - struct drbd_backing_dev *bdev, sector_t sector, int rw); + struct drbd_backing_dev *bdev, sector_t sector, int rw); extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); extern void drbd_rs_controller_reset(struct drbd_conf *mdev); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 16969c2b96cd..1b59ab3ab9c7 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3043,8 +3043,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->rs_sect_in, 0); atomic_set(&mdev->rs_sect_ev, 0); atomic_set(&mdev->ap_in_flight, 0); + atomic_set(&mdev->md_io_in_use, 0); - mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); mutex_init(&mdev->meta.mutex); sema_init(&mdev->data.work.s, 0); @@ -3722,8 +3722,10 @@ void drbd_md_sync(struct drbd_conf *mdev) if (!get_ldev_if_state(mdev, D_FAILED)) return; - mutex_lock(&mdev->md_io_mutex); - buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + goto out; + memset(buffer, 0, 512); buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); @@ -3754,7 +3756,8 @@ void drbd_md_sync(struct drbd_conf *mdev) * since we updated it on metadata. */ mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); +out: put_ldev(mdev); } @@ -3774,8 +3777,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!get_ldev_if_state(mdev, D_ATTACHING)) return ERR_IO_MD_DISK; - mutex_lock(&mdev->md_io_mutex); - buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + goto out; if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is @@ -3836,7 +3840,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) mdev->sync_conf.al_extents = 127; err: - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); + out: put_ldev(mdev); return rv; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index eedfe311c403..94a128b49f4f 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -78,6 +78,7 @@ void drbd_md_io_complete(struct bio *bio, int error) md_io->error = error; complete(&md_io->event); + drbd_md_put_buffer(mdev); } /* reads on behalf of the partner, -- cgit v1.2.3 From 0c464425158482647226fb30708c68fffc061585 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Sun, 26 Jun 2011 22:26:31 +0200 Subject: drbd: Implemented wait_until_done_or_disk_failure() Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 17 ++++++++++++++--- drivers/block/drbd/drbd_int.h | 3 ++- drivers/block/drbd/drbd_worker.c | 3 ++- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 4271352dd72b..aee599fad960 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -82,6 +82,17 @@ void drbd_md_put_buffer(struct drbd_conf *mdev) wake_up(&mdev->misc_wait); } +static bool md_io_allowed(struct drbd_conf *mdev) +{ + enum drbd_disk_state ds = mdev->state.disk; + return ds >= D_NEGOTIATING || ds == D_ATTACHING; +} + +void wait_until_done_or_disk_failure(struct drbd_conf *mdev, unsigned int *done) +{ + wait_event(mdev->misc_wait, *done || !md_io_allowed(mdev)); +} + static int _drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, struct page *page, sector_t sector, @@ -90,8 +101,8 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, struct bio *bio; int ok; - init_completion(&mdev->md_io.event); - mdev->md_io.error = 0; + mdev->md_io.done = 0; + mdev->md_io.error = -ENODEV; if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags)) rw |= REQ_FUA | REQ_FLUSH; @@ -112,7 +123,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, bio_endio(bio, -EIO); else submit_bio(rw, bio); - wait_for_completion(&mdev->md_io.event); + wait_until_done_or_disk_failure(mdev, &mdev->md_io.done); ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0; out: diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 55cae74911e8..10ea9e388246 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -938,7 +938,7 @@ struct drbd_backing_dev { }; struct drbd_md_io { - struct completion event; + unsigned int done; int error; }; @@ -1541,6 +1541,7 @@ extern void *drbd_md_get_buffer(struct drbd_conf *mdev); extern void drbd_md_put_buffer(struct drbd_conf *mdev); extern int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, sector_t sector, int rw); +extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, unsigned int *done); extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); extern void drbd_rs_controller_reset(struct drbd_conf *mdev); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 94a128b49f4f..ee8303680dd8 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -77,7 +77,8 @@ void drbd_md_io_complete(struct bio *bio, int error) md_io->error = error; - complete(&md_io->event); + md_io->done = 1; + wake_up(&mdev->misc_wait); drbd_md_put_buffer(mdev); } -- cgit v1.2.3 From 4a2fe568b5428abc56d7d172e3571e33d8ab7265 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 4 Jul 2011 11:14:31 +0200 Subject: drbd: Keep a reference to the bio until the completion handler finished Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 1 + drivers/block/drbd/drbd_worker.c | 1 + 2 files changed, 2 insertions(+) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index aee599fad960..528342c82ba1 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -118,6 +118,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; + bio_get(bio); /* one bio_put() is in the completion handler */ atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */ if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) bio_endio(bio, -EIO); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index ee8303680dd8..6dcd9f6e78c8 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -79,6 +79,7 @@ void drbd_md_io_complete(struct bio *bio, int error) md_io->done = 1; wake_up(&mdev->misc_wait); + bio_put(bio); drbd_md_put_buffer(mdev); } -- cgit v1.2.3 From b2057629ea96c33e4ae38102ecd0f27ed9a3c3ef Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 27 Jun 2011 12:23:33 +0200 Subject: drbd: Hold a reference to ldev while doing meta-data IO Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 6 ++++++ drivers/block/drbd/drbd_worker.c | 1 + 2 files changed, 7 insertions(+) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 528342c82ba1..3d7c2153daca 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -118,6 +118,12 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); + ok = 0; + goto out; + } + bio_get(bio); /* one bio_put() is in the completion handler */ atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */ if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 6dcd9f6e78c8..933091ffefca 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -81,6 +81,7 @@ void drbd_md_io_complete(struct bio *bio, int error) wake_up(&mdev->misc_wait); bio_put(bio); drbd_md_put_buffer(mdev); + put_ldev(mdev); } /* reads on behalf of the partner, -- cgit v1.2.3 From d1f3779bbeae70b9552c9ac70d6ec8c4d3545615 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 27 Jun 2011 14:56:52 +0200 Subject: drbd: Added a kref to bm_aio_ctx Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_bitmap.c | 84 ++++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 25 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 3030201c69d8..a2c337b38d6e 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -890,8 +890,16 @@ struct bm_aio_ctx { unsigned flags; #define BM_AIO_COPY_PAGES 1 int error; + struct kref kref; }; +static void bm_aio_ctx_destroy(struct kref *kref) +{ + struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref); + + kfree(ctx); +} + /* bv_page may be a copy, or may be the original */ static void bm_async_io_complete(struct bio *bio, int error) { @@ -936,8 +944,10 @@ static void bm_async_io_complete(struct bio *bio, int error) bio_put(bio); - if (atomic_dec_and_test(&ctx->in_flight)) + if (atomic_dec_and_test(&ctx->in_flight)) { complete(&ctx->done); + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + } } static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local) @@ -1001,12 +1011,7 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must */ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local) { - struct bm_aio_ctx ctx = { - .mdev = mdev, - .in_flight = ATOMIC_INIT(1), - .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done), - .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0, - }; + struct bm_aio_ctx *ctx; struct drbd_bitmap *b = mdev->bitmap; int num_pages, i, count = 0; unsigned long now; @@ -1021,7 +1026,21 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id * For lazy writeout, we don't care for ongoing changes to the bitmap, * as we submit copies of pages anyways. */ - if (!ctx.flags) + + ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + *ctx = (struct bm_aio_ctx) { + .mdev = mdev, + .in_flight = ATOMIC_INIT(1), + .done = COMPLETION_INITIALIZER(ctx->done), + .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0, + .error = 0, + .kref = { ATOMIC_INIT(2) }, + }; + + if (!ctx->flags) WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); num_pages = b->bm_number_of_pages; @@ -1046,27 +1065,28 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id continue; } } - atomic_inc(&ctx.in_flight); - bm_page_io_async(&ctx, i, rw); + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i, rw); ++count; cond_resched(); } /* - * We initialize ctx.in_flight to one to make sure bm_async_io_complete + * We initialize ctx->in_flight to one to make sure bm_async_io_complete * will not complete() early, and decrement / test it here. If there * are still some bios in flight, we need to wait for them here. */ - if (!atomic_dec_and_test(&ctx.in_flight)) - wait_for_completion(&ctx.done); + if (!atomic_dec_and_test(&ctx->in_flight)) + wait_for_completion(&ctx->done); + dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", rw == WRITE ? "WRITE" : "READ", count, jiffies - now); - if (ctx.error) { + if (ctx->error) { dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); drbd_chk_io_error(mdev, 1, true); - err = -EIO; /* ctx.error ? */ + err = -EIO; /* ctx->error ? */ } now = jiffies; @@ -1082,6 +1102,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + return err; } @@ -1130,28 +1152,40 @@ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(l */ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local) { - struct bm_aio_ctx ctx = { - .mdev = mdev, - .in_flight = ATOMIC_INIT(1), - .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done), - .flags = BM_AIO_COPY_PAGES, - }; + struct bm_aio_ctx *ctx; + int err; if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) { dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx); return 0; } - bm_page_io_async(&ctx, idx, WRITE_SYNC); - wait_for_completion(&ctx.done); + ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + *ctx = (struct bm_aio_ctx) { + .mdev = mdev, + .in_flight = ATOMIC_INIT(1), + .done = COMPLETION_INITIALIZER(ctx->done), + .flags = BM_AIO_COPY_PAGES, + .error = 0, + .kref = { ATOMIC_INIT(2) }, + }; + + bm_page_io_async(ctx, idx, WRITE_SYNC); + wait_for_completion(&ctx->done); - if (ctx.error) + if (ctx->error) drbd_chk_io_error(mdev, 1, true); /* that should force detach, so the in memory bitmap will be * gone in a moment as well. */ mdev->bm_writ_cnt++; - return ctx.error; + err = ctx->error; + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + + return err; } /* NOTE -- cgit v1.2.3 From 9e58c4dad70bfba86f016bcc98fa19f468d0b777 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 27 Jun 2011 15:29:16 +0200 Subject: drbd: Bitmap IO functions can now return prematurely if the disk breaks Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_bitmap.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index a2c337b38d6e..e5e756dbc434 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -886,7 +886,7 @@ void drbd_bm_clear_all(struct drbd_conf *mdev) struct bm_aio_ctx { struct drbd_conf *mdev; atomic_t in_flight; - struct completion done; + unsigned int done; unsigned flags; #define BM_AIO_COPY_PAGES 1 int error; @@ -897,6 +897,7 @@ static void bm_aio_ctx_destroy(struct kref *kref) { struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref); + put_ldev(ctx->mdev); kfree(ctx); } @@ -945,7 +946,8 @@ static void bm_async_io_complete(struct bio *bio, int error) bio_put(bio); if (atomic_dec_and_test(&ctx->in_flight)) { - complete(&ctx->done); + ctx->done = 1; + wake_up(&mdev->misc_wait); kref_put(&ctx->kref, &bm_aio_ctx_destroy); } } @@ -1034,12 +1036,18 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id *ctx = (struct bm_aio_ctx) { .mdev = mdev, .in_flight = ATOMIC_INIT(1), - .done = COMPLETION_INITIALIZER(ctx->done), + .done = 0, .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0, .error = 0, .kref = { ATOMIC_INIT(2) }, }; + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n"); + kfree(ctx); + return -ENODEV; + } + if (!ctx->flags) WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); @@ -1073,11 +1081,16 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id /* * We initialize ctx->in_flight to one to make sure bm_async_io_complete - * will not complete() early, and decrement / test it here. If there + * will not set ctx->done early, and decrement / test it here. If there * are still some bios in flight, we need to wait for them here. + * If all IO is done already (or nothing had been submitted), there is + * no need to wait. Still, we need to put the kref associated with the + * "in_flight reached zero, all done" event. */ if (!atomic_dec_and_test(&ctx->in_flight)) - wait_for_completion(&ctx->done); + wait_until_done_or_disk_failure(mdev, &ctx->done); + else + kref_put(&ctx->kref, &bm_aio_ctx_destroy); dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", rw == WRITE ? "WRITE" : "READ", @@ -1089,6 +1102,9 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id err = -EIO; /* ctx->error ? */ } + if (atomic_read(&ctx->in_flight)) + err = -EIO; /* Disk failed during IO... */ + now = jiffies; if (rw == WRITE) { drbd_md_flush(mdev); @@ -1103,7 +1119,6 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); kref_put(&ctx->kref, &bm_aio_ctx_destroy); - return err; } @@ -1167,14 +1182,20 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc *ctx = (struct bm_aio_ctx) { .mdev = mdev, .in_flight = ATOMIC_INIT(1), - .done = COMPLETION_INITIALIZER(ctx->done), + .done = 0, .flags = BM_AIO_COPY_PAGES, .error = 0, .kref = { ATOMIC_INIT(2) }, }; + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n"); + kfree(ctx); + return -ENODEV; + } + bm_page_io_async(ctx, idx, WRITE_SYNC); - wait_for_completion(&ctx->done); + wait_until_done_or_disk_failure(mdev, &ctx->done); if (ctx->error) drbd_chk_io_error(mdev, 1, true); @@ -1182,9 +1203,8 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc * gone in a moment as well. */ mdev->bm_writ_cnt++; - err = ctx->error; + err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error; kref_put(&ctx->kref, &bm_aio_ctx_destroy); - return err; } -- cgit v1.2.3 From 5ca1de0384dafe843de10fed843de26de740bca1 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 28 Jun 2011 17:01:19 +0200 Subject: drbd: Allow new IOs while the local disk in in FAILED state The last bunch of commits prepared the 'detach from tar pit' feature. With that we can be for long time in disk state FAILED. We need to accept new IO requests during that time. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 10ea9e388246..c7976a77dfba 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -2310,12 +2310,12 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) case D_OUTDATED: case D_CONSISTENT: case D_UP_TO_DATE: + case D_FAILED: /* disk state is stable as well. */ break; /* no new io accepted during tansitional states */ case D_ATTACHING: - case D_FAILED: case D_NEGOTIATING: case D_UNKNOWN: case D_MASK: -- cgit v1.2.3 From 02ee8f95fadf7c94b3d28df436a095152f6392b2 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 14 Mar 2011 11:54:47 +0100 Subject: drbd: Force flag for the detach operation Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_nl.c | 15 +++++++++++++++ include/linux/drbd_nl.h | 4 +++- 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 1b59ab3ab9c7..bc8a8a7556da 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -495,7 +495,7 @@ static int cl_wide_st_chg(struct drbd_conf *mdev, ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || - (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || + (os.disk != D_FAILED && ns.disk == D_FAILED))) || (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index cabad39f908c..6d116a2b2321 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1337,17 +1337,32 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, { enum drbd_ret_code retcode; int ret; + struct detach dt = {}; + + if (!detach_from_tags(mdev, nlp->tag_list, &dt)) { + reply->ret_code = ERR_MANDATORY_TAG; + goto out; + } + + if (dt.detach_force) { + drbd_force_state(mdev, NS(disk, D_FAILED)); + reply->ret_code = SS_SUCCESS; + goto out; + } + drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */ retcode = drbd_request_state(mdev, NS(disk, D_FAILED)); /* D_FAILED will transition to DISKLESS. */ ret = wait_event_interruptible(mdev->misc_wait, mdev->state.disk != D_FAILED); drbd_resume_io(mdev); + if ((int)retcode == (int)SS_IS_DISKLESS) retcode = SS_NOTHING_TO_DO; if (ret) retcode = ERR_INTR; reply->ret_code = retcode; +out: return 0; } diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index ab6159e4fcf0..7203c9ead233 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -33,7 +33,9 @@ NL_PACKET(disk_conf, 3, NL_BIT( 58, T_MAY_IGNORE, no_disk_drain) ) -NL_PACKET(detach, 4, ) +NL_PACKET(detach, 4, + NL_BIT( 88, T_MANDATORY, detach_force) +) NL_PACKET(net_conf, 5, NL_STRING( 8, T_MANDATORY, my_addr, 128) -- cgit v1.2.3 From dfa8bedbfe881caf6676704ab1aae18dfe8e430a Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 29 Jun 2011 14:06:08 +0200 Subject: drbd: Implemented the disk-timeout option When the disk-timeout is active, and it expires for a single request, we consider the local disk as D_FAILED. Note: With this change, I made both timeout based state transitions HARD state transitions. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 5 +++++ drivers/block/drbd/drbd_receiver.c | 2 -- drivers/block/drbd/drbd_req.c | 32 ++++++++++++++++++++------------ include/linux/drbd_limits.h | 5 +++++ include/linux/drbd_nl.h | 1 + 5 files changed, 31 insertions(+), 14 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index bc8a8a7556da..4bd636524dd1 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1404,6 +1404,9 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Here we have the actions that are performed after a state change. This function might sleep */ + if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING) + mod_timer(&mdev->request_timer, jiffies + HZ); + nsm.i = -1; if (ns.susp_nod) { if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) @@ -3318,6 +3321,8 @@ static void drbd_delete_device(unsigned int minor) if (!mdev) return; + del_timer_sync(&mdev->request_timer); + /* paranoia asserts */ if (mdev->open_cnt != 0) dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index e7ed0aa93a16..a85bbe1bbc2b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3803,8 +3803,6 @@ static void drbd_disconnect(struct drbd_conf *mdev) atomic_set(&mdev->rs_pending_cnt, 0); wake_up(&mdev->misc_wait); - del_timer(&mdev->request_timer); - /* make sure syncer is stopped and w_resume_next_sg queued */ del_timer_sync(&mdev->resync_timer); resync_timer_fn((unsigned long)mdev); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 1a8aac4b0c2f..ef145e33a647 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1208,13 +1208,19 @@ void request_timer_fn(unsigned long data) struct drbd_conf *mdev = (struct drbd_conf *) data; struct drbd_request *req; /* oldest request */ struct list_head *le; - unsigned long et = 0; /* effective timeout = ko_count * timeout */ + unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ if (get_net_conf(mdev)) { - et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count; + ent = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count; put_net_conf(mdev); } - if (!et || mdev->state.conn < C_WF_REPORT_PARAMS) + if (get_ldev(mdev)) { + dt = mdev->ldev->dc.disk_timeout * HZ / 10; + put_ldev(mdev); + } + et = min_not_zero(dt, ent); + + if (!et || (mdev->state.conn < C_WF_REPORT_PARAMS && mdev->state.disk <= D_FAILED)) return; /* Recurring timer stopped */ spin_lock_irq(&mdev->req_lock); @@ -1227,17 +1233,19 @@ void request_timer_fn(unsigned long data) le = le->prev; req = list_entry(le, struct drbd_request, tl_requests); - if (time_is_before_eq_jiffies(req->start_time + et)) { - if (req->rq_state & RQ_NET_PENDING) { + if (ent && req->rq_state & RQ_NET_PENDING) { + if (time_is_before_eq_jiffies(req->start_time + ent)) { dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); - _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL); - } else { - dev_warn(DEV, "Local backing block device frozen?\n"); - mod_timer(&mdev->request_timer, jiffies + et); + _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); } - } else { - mod_timer(&mdev->request_timer, req->start_time + et); } - + if (dt && req->rq_state & RQ_LOCAL_PENDING) { + if (time_is_before_eq_jiffies(req->start_time + dt)) { + dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); + __drbd_chk_io_error(mdev, 1); + } + } + nt = (time_is_before_eq_jiffies(req->start_time + et) ? jiffies : req->start_time) + et; spin_unlock_irq(&mdev->req_lock); + mod_timer(&mdev->request_timer, nt); } diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 928c84dfaf42..fb670bf603f7 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -48,6 +48,11 @@ #define DRBD_TIMEOUT_MAX 600 #define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ + /* If backing disk takes longer than disk_timeout, mark the disk as failed */ +#define DRBD_DISK_TIMEOUT_MIN 0 /* 0 = disabled */ +#define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */ +#define DRBD_DISK_TIMEOUT_DEF 0 /* disabled */ + /* active connection retries when C_WF_CONNECTION */ #define DRBD_CONNECT_INT_MIN 1 #define DRBD_CONNECT_INT_MAX 120 diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index 7203c9ead233..a8706f08ab36 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -31,6 +31,7 @@ NL_PACKET(disk_conf, 3, NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs) NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier) NL_BIT( 58, T_MAY_IGNORE, no_disk_drain) + NL_INTEGER( 89, T_MAY_IGNORE, disk_timeout) ) NL_PACKET(detach, 4, -- cgit v1.2.3 From 22f46ce2ef94151f806f84ab0f9ee43a72dfb1f1 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 11 Jul 2011 17:32:26 +0200 Subject: drbd: change some GFP_KERNEL to GFP_NOIO Bitmap IO may happend in the context of an application write, in the generic block IO path. We need to use GFP_NOIO. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_bitmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index e5e756dbc434..9611db43cc7a 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -955,7 +955,7 @@ static void bm_async_io_complete(struct bio *bio, int error) static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local) { /* we are process context. we always get a bio */ - struct bio *bio = bio_alloc(GFP_KERNEL, 1); + struct bio *bio = bio_alloc(GFP_NOIO, 1); struct drbd_conf *mdev = ctx->mdev; struct drbd_bitmap *b = mdev->bitmap; struct page *page; @@ -1029,7 +1029,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id * as we submit copies of pages anyways. */ - ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_KERNEL); + ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); if (!ctx) return -ENOMEM; @@ -1175,7 +1175,7 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc return 0; } - ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_KERNEL); + ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); if (!ctx) return -ENOMEM; -- cgit v1.2.3 From bca482e90bf87569ffb87cba2c3a777ac23e5c86 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 15 Jul 2011 12:14:27 +0200 Subject: drbd: Fixed current UUID generation Now, the new edition of the clause only fires if a diskless peer gets promoted. This is a fixup for "drbd: Delayed creation of current-UUID". Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 4bd636524dd1..b62e3559b50c 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1494,11 +1494,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, } if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { - if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) { + if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { drbd_uuid_new_current(mdev); drbd_send_uuids(mdev); } - /* D_DISKLESS Peer becomes secondary */ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) /* We may still be Primary ourselves. -- cgit v1.2.3 From 79f16f5dbc95da372c25afddac80f4adef3cfce1 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 15 Jul 2011 18:44:26 +0200 Subject: drbd: Consider that the no-data-condition could be in connected state ...when the peer has inconsistent data. In that case we failed to clear the susp_nod flag. When the local disk was attached again Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index b62e3559b50c..0ee05315f9ac 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1412,7 +1412,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) what = resend; - if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) what = restart_frozen_disk_io; if (what != nothing) -- cgit v1.2.3 From fd2491f4a4a403b376f71a336a36848158efb0fe Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 18 Jul 2011 16:25:15 +0200 Subject: drbd: detach must not try to abort non-local requests from drbd-8.4 Cherry picked form 8.4 Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 43 +++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 0ee05315f9ac..56569a803331 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -344,7 +344,7 @@ bail: * @what: The action/event to perform with all request objects * * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io, - * restart_frozen_disk_io, abort_disk_io. + * restart_frozen_disk_io. */ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) { @@ -368,12 +368,6 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) } tmp = b->next; - if (what == abort_disk_io) { - /* Only walk the TL, leave barrier objects in place */ - b = tmp; - continue; - } - if (n_writes) { if (what == resend) { b->n_writes = n_writes; @@ -422,7 +416,6 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) /* Actions operating on the disk state, also want to work on requests that got barrier acked. */ switch (what) { - case abort_disk_io: case fail_frozen_disk_io: case restart_frozen_disk_io: list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { @@ -482,6 +475,38 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) spin_unlock_irq(&mdev->req_lock); } +/** + * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL + * @mdev: DRBD device. + */ +void tl_abort_disk_io(struct drbd_conf *mdev) +{ + struct drbd_tl_epoch *b; + struct list_head *le, *tle; + struct drbd_request *req; + + spin_lock_irq(&mdev->req_lock); + b = mdev->oldest_tle; + while (b) { + list_for_each_safe(le, tle, &b->requests) { + req = list_entry(le, struct drbd_request, tl_requests); + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + _req_mod(req, abort_disk_io); + } + b = b->next; + } + + list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + _req_mod(req, abort_disk_io); + } + + spin_unlock_irq(&mdev->req_lock); +} + /** * cl_wide_st_chg() - true if the state change is a cluster wide one * @mdev: DRBD device. @@ -1577,7 +1602,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Immediately allow completion of all application IO, that waits for completion from the local disk. */ - tl_restart(mdev, abort_disk_io); + tl_abort_disk_io(mdev); /* current state still has to be D_FAILED, * there is only one way out: to D_DISKLESS, -- cgit v1.2.3 From 80f9fd55a66a6843373330901564ef2d9c7fb050 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 18 Jul 2011 15:45:15 +0200 Subject: drbd: Cleanup all epoch objects upon connection loss Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index a85bbe1bbc2b..bb92671d1f16 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1001,13 +1001,14 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, if (epoch_size != 0 && atomic_read(&epoch->active) == 0 && - test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) { + (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { if (!(ev & EV_CLEANUP)) { spin_unlock(&mdev->epoch_lock); drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); spin_lock(&mdev->epoch_lock); } - dec_unacked(mdev); + if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) + dec_unacked(mdev); if (mdev->current_epoch != epoch) { next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); -- cgit v1.2.3 From 1e86ac48af137a3cfd48cba727e7abe132dfc8de Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 4 Aug 2011 10:33:08 +0200 Subject: drbd: Bugfix for the connection behavior If we get into the C_BROKEN_PIPE cstate once, the state engine set the thi->t_state of the receiver thread to restarting. But with the while loop in drbdd_init() a new connection gets established. After the call into drbdd() returns immediately since the thi->t_state is not RUNNING. The restart of drbd_init() then resets thi->t_state to RUNNING. I.e. after entering C_BROKEN_PIPE once, the next successful established connection gets wasted. The two parts of the fix: * Do not cause the thread to restart if we detect the issue with the sockets while we are in C_WF_CONNECTION. * Make sure that all actions that would have set us to C_BROKEN_PIPE happen before the state change to C_WF_REPORT_PARAMS. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_receiver.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 56569a803331..8658dac2853a 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1318,7 +1318,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, drbd_thread_stop_nowait(&mdev->receiver); /* Upon network failure, we need to restart the receiver. */ - if (os.conn > C_TEAR_DOWN && + if (os.conn > C_WF_CONNECTION && ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) drbd_thread_restart_nowait(&mdev->receiver); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index bb92671d1f16..aa6c52c80e2e 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -888,17 +888,12 @@ retry: } } - if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS) - return 0; - sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; atomic_set(&mdev->packet_seq, 0); mdev->peer_seq = 0; - drbd_thread_start(&mdev->asender); - if (drbd_send_protocol(mdev) == -1) return -1; drbd_send_sync_param(mdev, &mdev->sync_conf); @@ -907,6 +902,11 @@ retry: drbd_send_state(mdev); clear_bit(USE_DEGR_WFC_T, &mdev->flags); clear_bit(RESIZE_PENDING, &mdev->flags); + + if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS) + return 0; + + drbd_thread_start(&mdev->asender); mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ return 1; -- cgit v1.2.3 From 40424e4a24bc500639cb4bf1bf846362b0e692a5 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 26 Sep 2011 15:24:56 +0200 Subject: drbd: fix "stalled" empty resync With sync-after dependencies, given "lucky" timing of pause/unpause events, and the end of an empty (0 bits set) resync was sometimes not detected on the SyncTarget, leading to a "stalled" SyncSource state. Fixed this by expecting not only "Inconsistent -> UpToDate" but also "Consistent -> UpToDate" transitions for the peer disk state to end a resync. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index aa6c52c80e2e..1a48e02b83bc 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3169,9 +3169,14 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned os = ns = mdev->state; spin_unlock_irq(&mdev->req_lock); - /* peer says his disk is uptodate, while we think it is inconsistent, - * and this happens while we think we have a sync going on. */ - if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE && + /* If this is the "end of sync" confirmation, usually the peer disk + * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits + * set) resync started in PausedSyncT, or if the timing of pause-/ + * unpause-sync events has been "just right", the peer disk may + * transition from D_CONSISTENT to D_UP_TO_DATE as well. + */ + if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) && + real_peer_disk == D_UP_TO_DATE && os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { /* If we are (becoming) SyncSource, but peer is still in sync * preparation, ignore its uptodate-ness to avoid flapping, it -- cgit v1.2.3 From 7b4e4d31268cbd885782bf3eee8027c852c9d191 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 28 Sep 2011 22:15:04 +0200 Subject: drbd: drbd_nl_resize(): Fix missing put_ldev() on error path Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6d116a2b2321..1a81d9ed1b24 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1727,7 +1727,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, if (rs.no_resync && mdev->agreed_pro_version < 93) { retcode = ERR_NEED_APV_93; - goto fail; + goto fail_ldev; } if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) @@ -1754,6 +1754,10 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, fail: reply->ret_code = retcode; return 0; + + fail_ldev: + put_ldev(mdev); + goto fail; } static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, -- cgit v1.2.3 From 5ba3dac52126699e541ac3ee37aad890ca835fc1 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 5 Oct 2011 15:54:18 +0200 Subject: drbd: Derive sync-UUIDs only from the bitmap-uuid if it is non-zero Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8658dac2853a..64318d4ca9ec 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2138,7 +2138,11 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) D_ASSERT(mdev->state.disk == D_UP_TO_DATE); - uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET; + uuid = mdev->ldev->md.uuid[UI_BITMAP]; + if (uuid && uuid != UUID_JUST_CREATED) + uuid = uuid + UUID_NEW_BM_OFFSET; + else + get_random_bytes(&uuid, sizeof(u64)); drbd_uuid_set(mdev, UI_BITMAP, uuid); drbd_print_uuids(mdev, "updated sync UUID"); drbd_md_sync(mdev); -- cgit v1.2.3 From 6a9a92f4ef05bb3e94bbfe123c21482fa5da9866 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 6 Oct 2011 17:10:34 +0200 Subject: drbd: fix harmless race to not trigger an ASSERT We have one pre-allocated page to do certain synchronous meta data IO with, using it is serialized like so: drbd_md_get_buffer(); drbd_md_sync_page_io(); drbd_md_sync_page_io(); ... drbd_md_put_buffer(); In drbd_md_sync_page_io() there is an ASSERT(atomic_read(&mdev->md_io_in_use) == 1); We want to be able to timeout on unresponsive lower level devices, so we can "detach" in that case. Inside drbd_md_sync_page_io() we grab an extra reference, to not have a dangling pointer in case a delayed IO eventually does still complete, even after we "detached" already. We need to put the extra reference before we signal completion from the completion handler, or the second drbd_md_sync_page_io() above may trigger the assert (reference count still 2). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_worker.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 933091ffefca..62bde5ae17f7 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -77,10 +77,21 @@ void drbd_md_io_complete(struct bio *bio, int error) md_io->error = error; + /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able + * to timeout on the lower level device, and eventually detach from it. + * If this io completion runs after that timeout expired, this + * drbd_md_put_buffer() may allow us to finally try and re-attach. + * During normal operation, this only puts that extra reference + * down to 1 again. + * Make sure we first drop the reference, and only then signal + * completion, or we may (in drbd_al_read_log()) cycle so fast into the + * next drbd_md_sync_page_io(), that we trigger the + * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there. + */ + drbd_md_put_buffer(mdev); md_io->done = 1; wake_up(&mdev->misc_wait); bio_put(bio); - drbd_md_put_buffer(mdev); put_ldev(mdev); } -- cgit v1.2.3 From aaae506d545bb9d06f4d8362f670f406f12e4b58 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 6 Oct 2011 18:29:14 +0200 Subject: drbd: Fixed a race condition between detach and start of resync drbd_state_lock() is only there to serialize cluster wide state changes. Testing the local disk state needs to happen while holding the global_state_lock. Otherwise you might see something like this (Oct 6 on kugel) 14:20:24 drbd0: conn( WFSyncUUID -> Connected ) disk( Inconsistent -> Failed ) 14:20:24 drbd0: helper command: /sbin/drbdadm before-resync-target minor-0 exit code 0 (0x0) 14:20:24 drbd0: conn( Connected -> SyncTarget ) disk( Failed -> Inconsistent ) Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_worker.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 62bde5ae17f7..5fc60f622bed 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1537,14 +1537,14 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) } drbd_state_lock(mdev); - + write_lock_irq(&global_state_lock); if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { + write_unlock_irq(&global_state_lock); drbd_state_unlock(mdev); return; } - write_lock_irq(&global_state_lock); - ns = mdev->state; + ns.i = mdev->state.i; ns.aftr_isp = !_drbd_may_sync_now(mdev); -- cgit v1.2.3 From a2e9138197405a4c051630416ceebf98158e631d Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 6 Oct 2011 17:30:26 +0200 Subject: drbd: fix spurious meta data IO "error" When detaching, even cleanly detaching due to administrator request, we always go through D_FAILED before we become D_DISKLESS. Don't let that state change race with an in-flight meta data IO, or that one might think it actually experienced an IO error. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 1a81d9ed1b24..00a82ab7ab98 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1351,7 +1351,9 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, } drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */ + drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */ retcode = drbd_request_state(mdev, NS(disk, D_FAILED)); + drbd_md_put_buffer(mdev); /* D_FAILED will transition to DISKLESS. */ ret = wait_event_interruptible(mdev->misc_wait, mdev->state.disk != D_FAILED); -- cgit v1.2.3 From f479ea06613514814449f28cba6488e31698e406 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 27 Oct 2011 16:52:30 +0200 Subject: drbd: send intermediate state change results to the peer DRBD state changes schedule after_state_ch() actions to a worker thread, which decides on the old and new states of that change, whether to send an informational state update packet (P_STATE) to the peer. If it decides to drbd_send_state(), it would however always send the _curent_ state, which, if a second state change happens before the after_state_ch() of the first ran, may "fast-forward" the peer's view about this node. In most cases that is harmless, but sometimes this can confuse DRBD, for example into not actually starting a necessary resync if you do a very tight detach/attach loop on a Connected Secondary. Fix this by always sending the "new" state of the respective state transition which scheduled this after_state_ch() work. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 4 +-- drivers/block/drbd/drbd_main.c | 53 ++++++++++++++++++++++++++++++-------- drivers/block/drbd/drbd_nl.c | 2 +- drivers/block/drbd/drbd_receiver.c | 4 +-- 4 files changed, 47 insertions(+), 16 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index c7976a77dfba..31dee20f3411 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1229,8 +1229,8 @@ extern int drbd_send_uuids(struct drbd_conf *mdev); extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); -extern int _drbd_send_state(struct drbd_conf *mdev); -extern int drbd_send_state(struct drbd_conf *mdev); +extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s); +extern int drbd_send_current_state(struct drbd_conf *mdev); extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, enum drbd_packets cmd, struct p_header80 *h, size_t size, unsigned msg_flags); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 64318d4ca9ec..3a5b4dec529f 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1487,7 +1487,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Do not change the order of the if above and the two below... */ if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_state(mdev, ns); } /* No point in queuing send_bitmap if we don't have a connection * anymore, so check also the _current_ state, not only the new state @@ -1552,14 +1552,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { drbd_send_sizes(mdev, 0, 0); /* to start sync... */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_state(mdev, ns); } /* We want to pause/continue resync, tell peer. */ if (ns.conn >= C_CONNECTED && ((os.aftr_isp != ns.aftr_isp) || (os.user_isp != ns.user_isp))) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); /* In case one of the isp bits got set, suspend other devices. */ if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && @@ -1569,10 +1569,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Make sure the peer gets informed about eventual state changes (ISP bits) while we were in WFReportParams. */ if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); if (os.conn != C_AHEAD && ns.conn == C_AHEAD) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); /* We are in the progress to start a full sync... */ if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || @@ -1612,7 +1612,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, "ASSERT FAILED: disk is %s during detach\n", drbd_disk_str(mdev->state.disk)); - if (drbd_send_state(mdev)) + if (drbd_send_state(mdev, ns)) dev_info(DEV, "Notified peer that I am detaching my disk\n"); drbd_rs_cancel_all(mdev); @@ -1642,7 +1642,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, mdev->rs_failed = 0; atomic_set(&mdev->rs_pending_cnt, 0); - if (drbd_send_state(mdev)) + if (drbd_send_state(mdev, ns)) dev_info(DEV, "Notified peer that I'm now diskless.\n"); /* corresponding get_ldev in __drbd_set_state * this may finally trigger drbd_ldev_destroy. */ @@ -1651,7 +1651,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Notify peer that I had a local IO error, and did not detached.. */ if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); /* Disks got bigger while they were detached */ if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && @@ -1669,7 +1669,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* sync target done with resync. Explicitly notify peer, even though * it should (at least for non-empty resyncs) already know itself. */ if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); /* This triggers bitmap writeout of potentially still unwritten pages * if the resync finished cleanly, or aborted because of peer disk @@ -2191,10 +2191,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl } /** - * drbd_send_state() - Sends the drbd state to the peer + * drbd_send_current_state() - Sends the drbd state to the peer * @mdev: DRBD device. */ -int drbd_send_state(struct drbd_conf *mdev) +int drbd_send_current_state(struct drbd_conf *mdev) { struct socket *sock; struct p_state p; @@ -2220,6 +2220,37 @@ int drbd_send_state(struct drbd_conf *mdev) return ok; } +/** + * drbd_send_state() - After a state change, sends the new state to the peer + * @mdev: DRBD device. + * @state: the state to send, not necessarily the current state. + * + * Each state change queues an "after_state_ch" work, which will eventually + * send the resulting new state to the peer. If more state changes happen + * between queuing and processing of the after_state_ch work, we still + * want to send each intermediary state in the order it occurred. + */ +int drbd_send_state(struct drbd_conf *mdev, union drbd_state state) +{ + struct socket *sock; + struct p_state p; + int ok = 0; + + mutex_lock(&mdev->data.mutex); + + p.state = cpu_to_be32(state.i); + sock = mdev->data.socket; + + if (likely(sock != NULL)) { + ok = _drbd_send_cmd(mdev, sock, P_STATE, + (struct p_header80 *)&p, sizeof(p), 0); + } + + mutex_unlock(&mdev->data.mutex); + + return ok; +} + int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val) { diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 00a82ab7ab98..1bbbad302ae7 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -432,7 +432,7 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) /* if this was forced, we should consider sync */ if (forced) drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); } drbd_md_sync(mdev); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 1a48e02b83bc..f0d86cb300cf 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -899,7 +899,7 @@ retry: drbd_send_sync_param(mdev, &mdev->sync_conf); drbd_send_sizes(mdev, 0, 0); drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); clear_bit(USE_DEGR_WFC_T, &mdev->flags); clear_bit(RESIZE_PENDING, &mdev->flags); @@ -3294,7 +3294,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned /* Nowadays only used when forcing a node into primary role and setting its disk to UpToDate with that */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); } } -- cgit v1.2.3 From e89868a0927cfb8a3f535c938e5d6dd7edc6353c Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 9 Nov 2011 21:04:03 +0100 Subject: drbd: Fixed an obvious copy-n-paste mistake This bug might have caused troubles if disk-barriers and the ahead-behind more are enabled at the same time. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/drbd/drbd_worker.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index f0d86cb300cf..2e9dfc69828f 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -4437,7 +4437,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h) if (mdev->state.conn == C_AHEAD && atomic_read(&mdev->ap_in_flight) == 0 && - !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) { + !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) { mdev->start_resync_timer.expires = jiffies + HZ; add_timer(&mdev->start_resync_timer); } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 5fc60f622bed..56fd69e38298 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -746,7 +746,7 @@ int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel) } drbd_start_resync(mdev, C_SYNC_SOURCE); - clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags); + clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags); return 1; } -- cgit v1.2.3 From 763eb63625a625e4d160cbb4cce2bcdb40141b97 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 2 Nov 2011 16:29:45 +0100 Subject: drbd: fix potential spinlock deadlock drbd_try_clear_on_disk_bm() has a sanity check for the number of blocks left to be resynced (rs_left) in the current resync extent. If it detects a mismatch, it complains, and forces a disconnect using drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); Unfortunately, this may be called while holding the req_lock, and drbd_force_state() want's to aquire that lock itself. Deadlock. Don't force a disconnect, but fix up rs_left by recounting and reassigning the number of dirty blocks in that extent. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 3d7c2153daca..601ad9ef0437 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -711,16 +711,20 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, else ext->rs_failed += count; if (ext->rs_left < ext->rs_failed) { - dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d " - "rs_failed=%d count=%d\n", + dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d " + "rs_failed=%d count=%d cstate=%s\n", (unsigned long long)sector, ext->lce.lc_number, ext->rs_left, - ext->rs_failed, count); - dump_stack(); - - lc_put(mdev->resync, &ext->lce); - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return; + ext->rs_failed, count, + drbd_conn_str(mdev->state.conn)); + + /* We don't expect to be able to clear more bits + * than have been set when we originally counted + * the set bits to cache that value in ext->rs_left. + * Whatever the reason (disconnect during resync, + * delayed local completion of an application write), + * try to fix it up by recounting here. */ + ext->rs_left = drbd_bm_e_weight(mdev, enr); } } else { /* Normally this element should be in the cache, -- cgit v1.2.3 From 545752d5d8a2e01b4fcb23a61ec732b2b26cbe1a Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 5 Dec 2011 14:39:25 +0100 Subject: drbd: fix race between disconnect and receive_state If the asender thread, or request_timer_fn(), or some other part of the code, decided to drop the connection (because of timeout or other), but the receiver just now was processing a P_STATE packet, there was a chance that receive_state() would do a hard state change "re-establishing" an already failed connection without additional handshake. Log excerpt: Remote failed to finish a request within ko-count * timeout peer( Secondary -> Unknown ) conn( Connected -> Timeout ) pdsk( UpToDate -> DUnknown ) asender terminated ... peer( Unknown -> Secondary ) conn( Timeout -> Connected ) pdsk( DUnknown -> UpToDate ) peer_isp( 0 -> 1 ) ... Connection closed peer( Secondary -> Unknown ) conn( Connected -> Unconnected ) pdsk( UpToDate -> DUnknown ) peer_isp( 1 -> 0 ) receiver terminated Impact: while the connection state is erroneously "Connected", requests may be queued and even sent, which would never be acknowledged, and may have been missed by the cleanup. These requests would never be completed. The next drbd_suspend_io() will then lock up, waiting forever for these requests to complete. Fixed in several code paths: Make sure the connection state is NetworkFailure or worse before starting the cleanup in drbd_disconnect(). This should make sure the cleanup won't miss any requests. Disallow receive_state() to "upgrade" the connection state from an error state. This will make sure the "illegal" state transition won't happen. For all connection failure states, relax the safe-guard in sanitize_state() again to silently mask out those state changes (e.g. Timeout -> Connected becomes Timeout -> Timeout). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_receiver.c | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 3a5b4dec529f..f71a667f5f32 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -909,7 +909,7 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow. * If you try to go into some Sync* state, that shall fail (elsewhere). */ if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && - ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) + ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED) ns.conn = os.conn; /* we cannot fail (again) if we already detached */ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 2e9dfc69828f..08e694ef5ed9 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3169,6 +3169,12 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned os = ns = mdev->state; spin_unlock_irq(&mdev->req_lock); + /* If some other part of the code (asender thread, timeout) + * already decided to close the connection again, + * we must not "re-establish" it here. */ + if (os.conn <= C_TEAR_DOWN) + return false; + /* If this is the "end of sync" confirmation, usually the peer disk * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits * set) resync started in PausedSyncT, or if the timing of pause-/ @@ -3782,6 +3788,13 @@ static void drbd_disconnect(struct drbd_conf *mdev) if (mdev->state.conn == C_STANDALONE) return; + /* We are about to start the cleanup after connection loss. + * Make sure drbd_make_request knows about that. + * Usually we should be in some network failure state already, + * but just in case we are not, we fix it up here. + */ + drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); + /* asender does not clean up anything. it must not interfere, either */ drbd_thread_stop(&mdev->asender); drbd_free_sock(mdev); -- cgit v1.2.3 From 4afc433cf8066c112bd2bdd949d78ff8e8b4ba3f Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 13 Dec 2011 10:31:32 +0100 Subject: drbd: Do not send state packets while lower than C_CONNECTED cstate I.e. in C_WF_REPORT_PARAMS or in C_WF_CONNECTION. Sending may already work in these cstates, but the peer still expects the HandShake / ConnectionFeatures packet. Actually triggered by the Testuite on kugel. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index f71a667f5f32..b2c0e5f0d52c 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1612,8 +1612,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, "ASSERT FAILED: disk is %s during detach\n", drbd_disk_str(mdev->state.disk)); - if (drbd_send_state(mdev, ns)) - dev_info(DEV, "Notified peer that I am detaching my disk\n"); + if (ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); drbd_rs_cancel_all(mdev); @@ -1642,15 +1642,16 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, mdev->rs_failed = 0; atomic_set(&mdev->rs_pending_cnt, 0); - if (drbd_send_state(mdev, ns)) - dev_info(DEV, "Notified peer that I'm now diskless.\n"); + if (ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); + /* corresponding get_ldev in __drbd_set_state * this may finally trigger drbd_ldev_destroy. */ put_ldev(mdev); } /* Notify peer that I had a local IO error, and did not detached.. */ - if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT) + if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) drbd_send_state(mdev, ns); /* Disks got bigger while they were detached */ -- cgit v1.2.3 From 7caacb69ac468ea713e8e8ba77be8040d8fe7bbe Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 14 Dec 2011 18:01:21 +0100 Subject: drbd: Consider the disk-timeout also for meta-data IO operations If the backing device is already frozen during attach, we failed to recognize that. The current disk-timeout code works on top of the drbd_request objects. During attach we do not allow IO and therefore never generate a drbd_request object but block before that in drbd_make_request(). This patch adds the timeout to all drbd_md_sync_page_io(). Before this patch we used to go from D_ATTACHING directly to D_DISKLESS if IO failed during attach. We can no longer do this since we have to stay in D_FAILED until all IO ops issued to the backing device returned. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 13 ++++++-- drivers/block/drbd/drbd_bitmap.c | 4 +-- drivers/block/drbd/drbd_int.h | 3 +- drivers/block/drbd/drbd_main.c | 64 +++++++++++++++++++--------------------- 4 files changed, 45 insertions(+), 39 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 601ad9ef0437..08bd7c1b36e1 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -88,9 +88,16 @@ static bool md_io_allowed(struct drbd_conf *mdev) return ds >= D_NEGOTIATING || ds == D_ATTACHING; } -void wait_until_done_or_disk_failure(struct drbd_conf *mdev, unsigned int *done) +void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + unsigned int *done) { - wait_event(mdev->misc_wait, *done || !md_io_allowed(mdev)); + long dt = bdev->dc.disk_timeout * HZ / 10; + if (dt == 0) + dt = MAX_SCHEDULE_TIMEOUT; + + dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt); + if (dt == 0) + dev_err(DEV, "meta-data IO operation timed out\n"); } static int _drbd_md_sync_page_io(struct drbd_conf *mdev, @@ -130,7 +137,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, bio_endio(bio, -EIO); else submit_bio(rw, bio); - wait_until_done_or_disk_failure(mdev, &mdev->md_io.done); + wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done); ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0; out: diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 9611db43cc7a..49603bc67fe4 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1088,7 +1088,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id * "in_flight reached zero, all done" event. */ if (!atomic_dec_and_test(&ctx->in_flight)) - wait_until_done_or_disk_failure(mdev, &ctx->done); + wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); else kref_put(&ctx->kref, &bm_aio_ctx_destroy); @@ -1195,7 +1195,7 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc } bm_page_io_async(ctx, idx, WRITE_SYNC); - wait_until_done_or_disk_failure(mdev, &ctx->done); + wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); if (ctx->error) drbd_chk_io_error(mdev, 1, true); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 31dee20f3411..fe5797f73d88 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1541,7 +1541,8 @@ extern void *drbd_md_get_buffer(struct drbd_conf *mdev); extern void drbd_md_put_buffer(struct drbd_conf *mdev); extern int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, sector_t sector, int rw); -extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, unsigned int *done); +extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + unsigned int *done); extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); extern void drbd_rs_controller_reset(struct drbd_conf *mdev); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index b2c0e5f0d52c..8ca8925520ad 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -916,11 +916,6 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state if (ns.disk == D_FAILED && os.disk == D_DISKLESS) ns.disk = D_DISKLESS; - /* if we are only D_ATTACHING yet, - * we can (and should) go directly to D_DISKLESS. */ - if (ns.disk == D_FAILED && os.disk == D_ATTACHING) - ns.disk = D_DISKLESS; - /* After C_DISCONNECTING only C_STANDALONE may follow */ if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) ns.conn = os.conn; @@ -1592,35 +1587,38 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* first half of local IO error, failure to attach, * or administrative detach */ if (os.disk != D_FAILED && ns.disk == D_FAILED) { - enum drbd_io_error_p eh; - int was_io_error; + enum drbd_io_error_p eh = EP_PASS_ON; + int was_io_error = 0; /* corresponding get_ldev was in __drbd_set_state, to serialize - * our cleanup here with the transition to D_DISKLESS, - * so it is safe to dreference ldev here. */ - eh = mdev->ldev->dc.on_io_error; - was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); - - /* Immediately allow completion of all application IO, that waits - for completion from the local disk. */ - tl_abort_disk_io(mdev); - - /* current state still has to be D_FAILED, - * there is only one way out: to D_DISKLESS, - * and that may only happen after our put_ldev below. */ - if (mdev->state.disk != D_FAILED) - dev_err(DEV, - "ASSERT FAILED: disk is %s during detach\n", - drbd_disk_str(mdev->state.disk)); - - if (ns.conn >= C_CONNECTED) - drbd_send_state(mdev, ns); - - drbd_rs_cancel_all(mdev); - - /* In case we want to get something to stable storage still, - * this may be the last chance. - * Following put_ldev may transition to D_DISKLESS. */ - drbd_md_sync(mdev); + * our cleanup here with the transition to D_DISKLESS. + * But is is still not save to dreference ldev here, since + * we might come from an failed Attach before ldev was set. */ + if (mdev->ldev) { + eh = mdev->ldev->dc.on_io_error; + was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); + + /* Immediately allow completion of all application IO, that waits + for completion from the local disk. */ + tl_abort_disk_io(mdev); + + /* current state still has to be D_FAILED, + * there is only one way out: to D_DISKLESS, + * and that may only happen after our put_ldev below. */ + if (mdev->state.disk != D_FAILED) + dev_err(DEV, + "ASSERT FAILED: disk is %s during detach\n", + drbd_disk_str(mdev->state.disk)); + + if (ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); + + drbd_rs_cancel_all(mdev); + + /* In case we want to get something to stable storage still, + * this may be the last chance. + * Following put_ldev may transition to D_DISKLESS. */ + drbd_md_sync(mdev); + } put_ldev(mdev); if (was_io_error && eh == EP_CALL_HELPER) -- cgit v1.2.3 From 47a4f1c1bb684a7ed470aba71391d3bd8d77290c Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 12 Jan 2012 23:01:26 +0100 Subject: drbd: Fix module refcount leak in drbd_accept() drbd_accept was modelled after kernel_accept with drbd commit 53eb779 in July 2008. Only, kernel_accept was then broken, and only fixed later with kernel commit 1b08534e in Dec 2008: net: Fix module refcount leak in kernel_accept() Impact: protocol families provided as modules, e.g. ipv6 or ib_sdp, would soon have their reference count become negative, preventing them from being unloaded (likely), or worse, hit zero without actually being unused, allowing them to be unloaded while still in use (unlikely, but if triggered, causing a kernel crash). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 08e694ef5ed9..6b0505a3c4fc 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -466,6 +466,7 @@ static int drbd_accept(struct drbd_conf *mdev, const char **what, goto out; } (*newsock)->ops = sock->ops; + __module_get((*newsock)->ops->owner); out: return err; -- cgit v1.2.3 From 031a7c173ffda664ac5551bd13c313e513dd87a7 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Sat, 21 Jan 2012 16:50:25 +0100 Subject: drbd: add missing part_round_stats to _drbd_start_io_acct Without this, iostat frequently sees bogus svctime and >= 100% "utilization". Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index ef145e33a647..810070146862 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -37,6 +37,7 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req const int rw = bio_data_dir(bio); int cpu; cpu = part_stat_lock(); + part_round_stats(cpu, &mdev->vdisk->part0); part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); part_inc_in_flight(&mdev->vdisk->part0, rw); -- cgit v1.2.3 From fc28845bc005995b41ae8c83c7922d088f0ad228 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 16 Jan 2012 12:14:01 +0100 Subject: drbd: Fix a potential race that could case data inconsistency When we have a write request and a state change C_WF_BITMAP_S -> C_SYNC_SOURCE at the same time, and it happens that the line remote = remote && drbd_should_do_remote(s); stills sees C_WF_BITMAP_S, and send_oos = rw == WRITE && drbd_should_send_oos(s); already sees C_SYNC_SOURCE both are 0. This causes the write to not be mirrored, but marked as out-of-sync on the Sync_Source node. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 810070146862..4fc7f98d9856 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -786,6 +786,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns int local, remote, send_oos = 0; int err = -EIO; int ret = 0; + union drbd_state s; /* allocate outside of all locks; */ req = drbd_req_new(mdev, bio); @@ -847,8 +848,9 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns drbd_al_begin_io(mdev, sector); } - remote = remote && drbd_should_do_remote(mdev->state); - send_oos = rw == WRITE && drbd_should_send_oos(mdev->state); + s = mdev->state; + remote = remote && drbd_should_do_remote(s); + send_oos = rw == WRITE && drbd_should_send_oos(s); D_ASSERT(!(remote && send_oos)); if (!(local || remote) && !is_susp(mdev->state)) { -- cgit v1.2.3 From b6a370ba0786b5eb09c479bffeffe7baba484ab0 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Sun, 19 Feb 2012 01:27:53 +0100 Subject: drbd: Fix a potential write ordering issue on SyncTarget nodes If a SyncTarget node gets a P_RS_DATA_REPLY before a P_DATA packet for the same sector, it simply submits these two IO requests. This is be possible because on the SyncSource node, the data of the P_RS_DATA_REPLY packet was read from disk. Immediately after that a write request from upper layers came in. The disk scheduler or even the "hardware" queues on the disk drive might reorder these writes. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 6b0505a3c4fc..d601501c336a 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1585,6 +1585,24 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u return ok; } +static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e) +{ + + struct drbd_epoch_entry *rs_e; + bool rv = 0; + + spin_lock_irq(&mdev->req_lock); + list_for_each_entry(rs_e, &mdev->sync_ee, w.list) { + if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) { + rv = 1; + break; + } + } + spin_unlock_irq(&mdev->req_lock); + + return rv; +} + /* Called from receive_Data. * Synchronize packets on sock with packets on msock. * @@ -1828,6 +1846,9 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned list_add(&e->w.list, &mdev->active_ee); spin_unlock_irq(&mdev->req_lock); + if (mdev->state.conn == C_SYNC_TARGET) + wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e)); + switch (mdev->net_conf->wire_protocol) { case DRBD_PROT_C: inc_unacked(mdev); -- cgit v1.2.3 From 001a88687aff26d62f8b61d55c6973618cf0f72f Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 8 Mar 2012 16:43:45 +0100 Subject: drbd: fix potential data corruption and protocol error We assumed only bios with bi_idx == 0 would end up in drbd_make_request(). That is wrong. At least device mapper, in __clone_and_map(), may submit clones only covering a partial bio, but sharing the original bvec, by adjusting bi_idx and relevant other bio members of the clone. We used __bio_for_each_segment() in various places, even though that is documented as * drivers should not use the __ version unless they _really_ want to * run through the entire bio and not just pending pieces Impact: we would send the full bio bvec, even for the clone with bi_idx > 0, which will cause data corruption on the peer (because we submit wrong data at the clone offset), and will cause a DRBD protocol error, disconnect/reconnect and resync (thus fixing the corruption), because the next package header would be expected right in the middle of the sent data, causing DRBD magic mismatch. Fix: drop the assert, and use bio_for_each_segment() instead of the __ version. Conflicts: drbd/drbd_tracing.c Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 4 ++-- drivers/block/drbd/drbd_req.c | 1 - drivers/block/drbd/drbd_worker.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8ca8925520ad..f6cfc45b862a 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2734,7 +2734,7 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) struct bio_vec *bvec; int i; /* hint all but last page with MSG_MORE */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { if (!_drbd_no_send_page(mdev, bvec->bv_page, bvec->bv_offset, bvec->bv_len, i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) @@ -2748,7 +2748,7 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) struct bio_vec *bvec; int i; /* hint all but last page with MSG_MORE */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { if (!_drbd_send_page(mdev, bvec->bv_page, bvec->bv_offset, bvec->bv_len, i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 4fc7f98d9856..de319ba54241 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1106,7 +1106,6 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) */ D_ASSERT(bio->bi_size > 0); D_ASSERT((bio->bi_size & 0x1ff) == 0); - D_ASSERT(bio->bi_idx == 0); /* to make some things easier, force alignment of requests within the * granularity of our hash tables */ diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 56fd69e38298..14aa4a2d4dc4 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -308,7 +308,7 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * sg_init_table(&sg, 1); crypto_hash_init(&desc); - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); crypto_hash_update(&desc, &sg, sg.length); } -- cgit v1.2.3 From 671a74e749af8ca28cae1bfc141f2b3f30b7ad65 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 8 Mar 2012 11:45:57 +0100 Subject: drbd: remove now unused seq_num member from struct drbd_request Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 1 - drivers/block/drbd/drbd_main.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index fe5797f73d88..2f3501169d68 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -712,7 +712,6 @@ struct drbd_request { struct list_head tl_requests; /* ring list in the transfer log */ struct bio *master_bio; /* master bio pointer */ unsigned long rq_state; /* see comments above _req_mod() */ - int seq_num; unsigned long start_time; }; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index f6cfc45b862a..02fd1e109cf6 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2814,8 +2814,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) p.sector = cpu_to_be64(req->sector); p.block_id = (unsigned long)req; - p.seq_num = cpu_to_be32(req->seq_num = - atomic_add_return(1, &mdev->packet_seq)); + p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); -- cgit v1.2.3 From 1abc2af205590b0e9c0a450f67ecb0df7b0f0b5d Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 8 Mar 2012 11:47:03 +0100 Subject: drbd: missing wakeup after drbd_rs_del_all Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 08bd7c1b36e1..0a35f82f1222 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -1243,6 +1243,7 @@ int drbd_rs_del_all(struct drbd_conf *mdev) put_ldev(mdev); } spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); return 0; } -- cgit v1.2.3 From a5d214f621d47ffb89d294838006d30869050297 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 8 Mar 2012 11:49:40 +0100 Subject: drbd: remove some very outdated comments Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 2f3501169d68..fff8f790f79d 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -879,13 +879,6 @@ enum bm_flag { /* clear is not expected while bitmap is locked for bulk operation */ }; - -/* TODO sort members for performance - * MAYBE group them further */ - -/* THINK maybe we actually want to use the default "event/%s" worker threads - * or similar in linux 2.6, which uses per cpu data and threads. - */ struct drbd_work_queue { struct list_head q; struct semaphore s; /* producers up it, worker down()s it */ -- cgit v1.2.3 From 7ffcaa7194e2c96a738b936d2ae71c7f0c697c0a Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 8 Mar 2012 12:01:56 +0100 Subject: drbd: remove unused static helper function Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index fff8f790f79d..746f7933bf86 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1751,19 +1751,6 @@ static inline struct page *page_chain_next(struct page *page) #define page_chain_for_each_safe(page, n) \ for (; page && ({ n = page_chain_next(page); 1; }); page = n) -static inline int drbd_bio_has_active_page(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - __bio_for_each_segment(bvec, bio, i, 0) { - if (page_count(bvec->bv_page) > 1) - return 1; - } - - return 0; -} - static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) { struct page *page = e->pages; @@ -1774,7 +1761,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) return 0; } - static inline void drbd_state_lock(struct drbd_conf *mdev) { wait_event(mdev->misc_wait, -- cgit v1.2.3 From c088b2d904445f501c5aa7a6fc63ca9e8b96f224 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 23 Mar 2012 13:57:13 +0100 Subject: drbd: don't pretend that barrier_nr == 0 was special We used to have a barrier implementation where barrier_nr 0 was reserved. That is long gone. Just use the full sequence space. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 02fd1e109cf6..519f286a4299 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -247,9 +247,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) new->n_writes = 0; newest_before = mdev->newest_tle; - /* never send a barrier number == 0, because that is special-cased - * when using TCQ for our write ordering code */ - new->br_number = (newest_before->br_number+1) ?: 1; + new->br_number = newest_before->br_number+1; if (mdev->newest_tle != new) { mdev->newest_tle->next = new; mdev->newest_tle = new; -- cgit v1.2.3 From 6d49e101fd3d3dbd525564923a82fb8a66676420 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 11 Jan 2012 09:43:25 +0100 Subject: drbd: make OOS_HANDED_TO_NETWORK its own case OOS_HANDED_TO_NETWORK should not be grouped with the various *_CANCELED/*_FAILED cases. Also, not only clear the RQ_NET_QUEUED flag, but also mark it RQ_NET_DONE, so it can be distinguished from a local-only request even after that. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index de319ba54241..bfed0834cd6f 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -569,10 +569,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, drbd_queue_work(&mdev->data.work, &req->w); break; - case oos_handed_to_network: - /* actually the same */ case send_canceled: - /* treat it the same */ case send_failed: /* real cleanup will be done from tl_clear. just update flags * so it is no longer marked as on the worker queue */ @@ -602,11 +599,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, } req->rq_state &= ~RQ_NET_QUEUED; req->rq_state |= RQ_NET_SENT; - /* because _drbd_send_zc_bio could sleep, and may want to - * dereference the bio even after the "write_acked_by_peer" and - * "completed_ok" events came in, once we return from - * _drbd_send_zc_bio (drbd_send_dblock), we have to check - * whether it is done already, and end it. */ + _req_may_be_done_not_susp(req, m); + break; + + case oos_handed_to_network: + /* Was not set PENDING, no longer QUEUED, so is now DONE + * as far as this connection is concerned. */ + req->rq_state &= ~RQ_NET_QUEUED; + req->rq_state |= RQ_NET_DONE; _req_may_be_done_not_susp(req, m); break; -- cgit v1.2.3 From 41c4a0035b36d400b79cbd945390a76e909711a7 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 11 Jan 2012 09:46:48 +0100 Subject: drbd: fix READ_RETRY_REMOTE_CANCELED to not complete if device is suspended READ_RETRY_REMOTE_CANCELED needs to be grouped with the other _CANCELED cases, not with CONNECTION_LOST_WHILE_PENDING, as that would complete (fail) the bio even if the device became suspended. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index bfed0834cd6f..2a246ac84d7f 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -569,6 +569,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, drbd_queue_work(&mdev->data.work, &req->w); break; + case read_retry_remote_canceled: case send_canceled: case send_failed: /* real cleanup will be done from tl_clear. just update flags @@ -610,9 +611,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, _req_may_be_done_not_susp(req, m); break; - case read_retry_remote_canceled: - req->rq_state &= ~RQ_NET_QUEUED; - /* fall through, in case we raced with drbd_disconnect */ case connection_lost_while_pending: /* transfer log cleanup after connection loss */ /* assert something? */ -- cgit v1.2.3 From d64957c9a9757642f59aa4a63dadf159b2694bab Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 23 Mar 2012 14:42:19 +0100 Subject: drbd: fix WRITE_ACKED_BY_PEER_AND_SIS to not set RQ_NET_DONE Just because this request happened during a resync does not mean it may pretend to have been barrier-acked. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 2a246ac84d7f..cd7687fad9e6 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -627,8 +627,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, _req_may_be_done(req, m); /* Allowed while state.susp */ break; - case write_acked_by_peer_and_sis: - req->rq_state |= RQ_NET_SIS; case conflict_discarded_by_peer: /* for discarded conflicting writes of multiple primaries, * there is no need to keep anything in the tl, potential @@ -639,18 +637,15 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, (unsigned long long)req->sector, req->size); req->rq_state |= RQ_NET_DONE; /* fall through */ + case write_acked_by_peer_and_sis: case write_acked_by_peer: + if (what == write_acked_by_peer_and_sis) + req->rq_state |= RQ_NET_SIS; /* protocol C; successfully written on peer. - * Nothing to do here. + * Nothing more to do here. * We want to keep the tl in place for all protocols, to cater - * for volatile write-back caches on lower level devices. - * - * A barrier request is expected to have forced all prior - * requests onto stable storage, so completion of a barrier - * request could set NET_DONE right here, and not wait for the - * P_BARRIER_ACK, but that is an unnecessary optimization. */ + * for volatile write-back caches on lower level devices. */ - /* this makes it effectively the same as for: */ case recv_acked_by_peer: /* protocol B; pretends to be successfully written on peer. * see also notes above in handed_over_to_network about -- cgit v1.2.3 From 46385c84acd6654d3a38c9c7af1921dbded74aa2 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 16 Jan 2012 15:04:33 +0100 Subject: drbd: move put_ldev from __req_mod() to the endio callback One invocation in the endio handler is good enough, we don't need mention it for each of the different ways it calls __req_mod(). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 4 ---- drivers/block/drbd/drbd_worker.c | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index cd7687fad9e6..340d57b98565 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -441,7 +441,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state &= ~RQ_LOCAL_PENDING; _req_may_be_done_not_susp(req, m); - put_ldev(mdev); break; case abort_disk_io: @@ -458,7 +457,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, __drbd_chk_io_error(mdev, false); _req_may_be_done_not_susp(req, m); - put_ldev(mdev); break; case read_ahead_completed_with_error: @@ -466,7 +464,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; _req_may_be_done_not_susp(req, m); - put_ldev(mdev); break; case read_completed_with_error: @@ -478,7 +475,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, D_ASSERT(!(req->rq_state & RQ_NET_MASK)); __drbd_chk_io_error(mdev, false); - put_ldev(mdev); goto_queue_for_net_read: diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 14aa4a2d4dc4..620c70ff2231 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -244,6 +244,7 @@ void drbd_endio_pri(struct bio *bio, int error) spin_lock_irqsave(&mdev->req_lock, flags); __req_mod(req, what, &m); spin_unlock_irqrestore(&mdev->req_lock, flags); + put_ldev(mdev); if (m.bio) complete_master_bio(mdev, &m); -- cgit v1.2.3 From 197296ffed71b7d5056d8618a07fec145b040303 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 26 Mar 2012 16:47:11 +0200 Subject: drbd: Delay/reject other state changes while establishing a connection Changes to the role and disk state should be delayed or rejected while we establish a connection. This is necessary, since the peer will base its resync decision on the UUIDs and the state we sent in the drbd_connect() function. The most prominent example for this race is becoming primary after sending state and UUIDs and before the state changes to C_WF_CONNECTION. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 1 + drivers/block/drbd/drbd_main.c | 13 +++++++++++++ drivers/block/drbd/drbd_nl.c | 2 +- drivers/block/drbd/drbd_receiver.c | 10 +++++++++- 4 files changed, 24 insertions(+), 2 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 746f7933bf86..f215ad430bb8 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -850,6 +850,7 @@ enum { NEW_CUR_UUID, /* Create new current UUID when thawing IO */ AL_SUSPENDED, /* Activity logging is currently suspended. */ AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ + STATE_SENT, /* Do not change state/UUIDs while this is set */ }; struct drbd_bitmap; /* opaque for drbd_conf */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 519f286a4299..deccff3af774 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -841,6 +841,13 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) rv = SS_IN_TRANSIENT_STATE; + /* While establishing a connection only allow cstate to change. + Delay/refuse role changes, detach attach etc... */ + if (test_bit(STATE_SENT, &mdev->flags) && + !(os.conn == C_WF_REPORT_PARAMS || + (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) + rv = SS_IN_TRANSIENT_STATE; + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) rv = SS_NEED_CONNECTION; @@ -1668,6 +1675,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) drbd_send_state(mdev, ns); + /* Wake up role changes, that were delayed because of connection establishing */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) { + clear_bit(STATE_SENT, &mdev->flags); + wake_up(&mdev->state_wait); + } + /* This triggers bitmap writeout of potentially still unwritten pages * if the resync finished cleanly, or aborted because of peer disk * failure, or because of connection loss. diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 1bbbad302ae7..308beb8c0c1e 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -289,7 +289,7 @@ static int _try_outdate_peer_async(void *data) */ spin_lock_irq(&mdev->req_lock); ns = mdev->state; - if (ns.conn < C_WF_REPORT_PARAMS) { + if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) { ns.pdsk = nps; _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index d601501c336a..9db93ff11c02 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -751,6 +751,7 @@ static int drbd_connect(struct drbd_conf *mdev) { struct socket *s, *sock, *msock; int try, h, ok; + enum drbd_state_rv rv; D_ASSERT(!mdev->data.socket); @@ -897,6 +898,7 @@ retry: if (drbd_send_protocol(mdev) == -1) return -1; + set_bit(STATE_SENT, &mdev->flags); drbd_send_sync_param(mdev, &mdev->sync_conf); drbd_send_sizes(mdev, 0, 0); drbd_send_uuids(mdev); @@ -904,7 +906,13 @@ retry: clear_bit(USE_DEGR_WFC_T, &mdev->flags); clear_bit(RESIZE_PENDING, &mdev->flags); - if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS) + spin_lock_irq(&mdev->req_lock); + rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL); + if (mdev->state.conn != C_WF_REPORT_PARAMS) + clear_bit(STATE_SENT, &mdev->flags); + spin_unlock_irq(&mdev->req_lock); + + if (rv < SS_SUCCESS) return 0; drbd_thread_start(&mdev->asender); -- cgit v1.2.3 From 5de738272e38f7051c7a44c42631b71a0e2a1e80 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 28 Mar 2012 10:17:32 +0200 Subject: drbd: Ensure that data_size is not 0 before using data_size-1 as index This could be exploited by a peer which runs modified code. Reported-by: Dan Carpenter Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 9db93ff11c02..017eeb745ed9 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2837,10 +2837,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi if (apv >= 88) { if (apv == 88) { - if (data_size > SHARED_SECRET_MAX) { - dev_err(DEV, "verify-alg too long, " - "peer wants %u, accepting only %u byte\n", - data_size, SHARED_SECRET_MAX); + if (data_size > SHARED_SECRET_MAX || data_size == 0) { + dev_err(DEV, "verify-alg of wrong size, " + "peer wants %u, accepting only up to %u byte\n", + data_size, SHARED_SECRET_MAX); return false; } -- cgit v1.2.3 From ba280c092e6eca8a70c502e4510061535fdce382 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 25 Apr 2012 11:46:14 +0200 Subject: drbd: fix resend/resubmit of frozen IO DRBD can freeze IO, due to fencing policy (fencing resource-and-stonith), or because we lost access to data (on-no-data-accessible suspend-io). Resuming from there (re-connect, or re-attach, or explicit admin intervention) should "just work". Unfortunately, if the re-attach/re-connect did not happen within the timeout, since the commit drbd: Implemented real timeout checking for request processing time if so configured, the request_timer_fn() would timeout and detach/disconnect virtually immediately. This change tracks the most recent attach and connect, and does not timeout within after attach/connect. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 2 ++ drivers/block/drbd/drbd_main.c | 9 ++++++++ drivers/block/drbd/drbd_req.c | 52 ++++++++++++++++++++++++++++++------------ 3 files changed, 48 insertions(+), 15 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index f215ad430bb8..302a6e786f76 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1049,6 +1049,8 @@ struct drbd_conf { struct crypto_hash *csums_tfm; struct crypto_hash *verify_tfm; + unsigned long last_reattach_jif; + unsigned long last_reconnect_jif; struct drbd_thread receiver; struct drbd_thread worker; struct drbd_thread asender; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index deccff3af774..ab501b23b50e 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1326,6 +1326,15 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) drbd_resume_al(mdev); + /* remember last connect and attach times so request_timer_fn() won't + * kill newly established sessions while we are still trying to thaw + * previously frozen IO */ + if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS) + mdev->last_reconnect_jif = jiffies; + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) + mdev->last_reattach_jif = jiffies; + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); if (ascw) { ascw->os = os; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 340d57b98565..4a642ce62bae 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1200,43 +1200,65 @@ void request_timer_fn(unsigned long data) struct drbd_request *req; /* oldest request */ struct list_head *le; unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ + unsigned long now; if (get_net_conf(mdev)) { - ent = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count; + if (mdev->state.conn >= C_WF_REPORT_PARAMS) + ent = mdev->net_conf->timeout*HZ/10 + * mdev->net_conf->ko_count; put_net_conf(mdev); } - if (get_ldev(mdev)) { + if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */ dt = mdev->ldev->dc.disk_timeout * HZ / 10; put_ldev(mdev); } et = min_not_zero(dt, ent); - if (!et || (mdev->state.conn < C_WF_REPORT_PARAMS && mdev->state.disk <= D_FAILED)) + if (!et) return; /* Recurring timer stopped */ + now = jiffies; + spin_lock_irq(&mdev->req_lock); le = &mdev->oldest_tle->requests; if (list_empty(le)) { spin_unlock_irq(&mdev->req_lock); - mod_timer(&mdev->request_timer, jiffies + et); + mod_timer(&mdev->request_timer, now + et); return; } le = le->prev; req = list_entry(le, struct drbd_request, tl_requests); - if (ent && req->rq_state & RQ_NET_PENDING) { - if (time_is_before_eq_jiffies(req->start_time + ent)) { - dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); - _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); - } + + /* The request is considered timed out, if + * - we have some effective timeout from the configuration, + * with above state restrictions applied, + * - the oldest request is waiting for a response from the network + * resp. the local disk, + * - the oldest request is in fact older than the effective timeout, + * - the connection was established (resp. disk was attached) + * for longer than the timeout already. + * Note that for 32bit jiffies and very stable connections/disks, + * we may have a wrap around, which is catched by + * !time_in_range(now, last_..._jif, last_..._jif + timeout). + * + * Side effect: once per 32bit wrap-around interval, which means every + * ~198 days with 250 HZ, we have a window where the timeout would need + * to expire twice (worst case) to become effective. Good enough. + */ + if (ent && req->rq_state & RQ_NET_PENDING && + time_after(now, req->start_time + ent) && + !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) { + dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); + _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); } - if (dt && req->rq_state & RQ_LOCAL_PENDING) { - if (time_is_before_eq_jiffies(req->start_time + dt)) { - dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); - __drbd_chk_io_error(mdev, 1); - } + if (dt && req->rq_state & RQ_LOCAL_PENDING && + time_after(now, req->start_time + dt) && + !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { + dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); + __drbd_chk_io_error(mdev, 1); } - nt = (time_is_before_eq_jiffies(req->start_time + et) ? jiffies : req->start_time) + et; + nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; spin_unlock_irq(&mdev->req_lock); mod_timer(&mdev->request_timer, nt); } -- cgit v1.2.3 From a574daf5d722f4ca8cc18509f30b804c4d519962 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 25 Apr 2012 16:27:35 +0200 Subject: drbd: fix race between drbdadm invalidate/verify and finishing resync When a resync or online verify is finished or aborted, drbd does a bulk write-out of changed bitmap pages. If *in that very moment* a new verify or resync is triggered, this can race: ASSERT( !test_bit(BITMAP_IO, &mdev->flags) ) in drbd_main.c FIXME going to queue 'set_n_write from StartingSync' but 'write from resync_finished' still pending? and similar. This can be observed with e.g. tight invalidate loops in test scripts, and probably has no real-life implication. Still, that race can be solved by first quiescen the device, before starting a new resync or verify. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 308beb8c0c1e..867bf1d82988 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1963,6 +1963,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); @@ -1981,6 +1982,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); } + drbd_resume_io(mdev); reply->ret_code = retcode; return 0; @@ -2002,6 +2004,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); @@ -2020,6 +2023,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re } else retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); } + drbd_resume_io(mdev); reply->ret_code = retcode; return 0; @@ -2192,11 +2196,13 @@ static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, /* If there is still bitmap IO pending, e.g. previous resync or verify * just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); /* w_make_ov_request expects position to be aligned */ mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); + drbd_resume_io(mdev); return 0; } -- cgit v1.2.3 From 0e8488ade26b4b16a9745aa15ecb88c3fb1cb953 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 25 Apr 2012 23:06:45 +0200 Subject: drbd: allow bitmap to change during writeout from resync_finished Symptom: messages similar to "FIXME asender in bm_change_bits_to, bitmap locked for 'write from resync_finished' by worker" If a resync or verify is finished (or aborted), a full bitmap writeout is triggered. If we have ongoing local IO, the bitmap may still change during that writeout, pending and not yet processed acks may cause bits to be cleared, while new writes may cause bits to be to be set. To fix this, introduce the drbd_bm_write_copy_pages() variant. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_bitmap.c | 26 +++++++++++++++++++++----- drivers/block/drbd/drbd_int.h | 15 +++++++++++---- drivers/block/drbd/drbd_main.c | 4 ++-- 3 files changed, 34 insertions(+), 11 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 49603bc67fe4..9dab3700ca2d 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1011,7 +1011,7 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must /* * bm_rw: read/write the whole bitmap from/to its on disk location. */ -static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local) +static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local) { struct bm_aio_ctx *ctx; struct drbd_bitmap *b = mdev->bitmap; @@ -1037,7 +1037,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id .mdev = mdev, .in_flight = ATOMIC_INIT(1), .done = 0, - .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0, + .flags = flags, .error = 0, .kref = { ATOMIC_INIT(2) }, }; @@ -1128,7 +1128,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id */ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) { - return bm_rw(mdev, READ, 0); + return bm_rw(mdev, READ, 0, 0); } /** @@ -1139,7 +1139,7 @@ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) */ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) { - return bm_rw(mdev, WRITE, 0); + return bm_rw(mdev, WRITE, 0, 0); } /** @@ -1149,7 +1149,23 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) */ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local) { - return bm_rw(mdev, WRITE, upper_idx); + return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx); +} + +/** + * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location. + * @mdev: DRBD device. + * + * Will only write pages that have changed since last IO. + * In contrast to drbd_bm_write(), this will copy the bitmap pages + * to temporary writeout pages. It is intended to trigger a full write-out + * while still allowing the bitmap to change, for example if a resync or online + * verify is aborted due to a failed peer disk, while local IO continues, or + * pending resync acks are still being processed. + */ +int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local) +{ + return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0); } diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 302a6e786f76..6586053429b4 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -862,22 +862,28 @@ enum bm_flag { BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ /* currently locked for bulk operation */ - BM_LOCKED_MASK = 0x7, + BM_LOCKED_MASK = 0xf, /* in detail, that is: */ BM_DONT_CLEAR = 0x1, BM_DONT_SET = 0x2, BM_DONT_TEST = 0x4, + /* so we can mark it locked for bulk operation, + * and still allow all non-bulk operations */ + BM_IS_LOCKED = 0x8, + /* (test bit, count bit) allowed (common case) */ - BM_LOCKED_TEST_ALLOWED = 0x3, + BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED, /* testing bits, as well as setting new bits allowed, but clearing bits * would be unexpected. Used during bitmap receive. Setting new bits * requires sending of "out-of-sync" information, though. */ - BM_LOCKED_SET_ALLOWED = 0x1, + BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED, - /* clear is not expected while bitmap is locked for bulk operation */ + /* for drbd_bm_write_copy_pages, everything is allowed, + * only concurrent bulk operations are locked out. */ + BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED, }; struct drbd_work_queue { @@ -1457,6 +1463,7 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); +extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr); extern size_t drbd_bm_words(struct drbd_conf *mdev); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index ab501b23b50e..d830116781f9 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1698,8 +1698,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, * No harm done if some bits change during this phase. */ if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { - drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, - "write from resync_finished", BM_LOCKED_SET_ALLOWED); + drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, + "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); put_ldev(mdev); } -- cgit v1.2.3 From 4281808fb3580c381a23cceb0a29ced92d570a5f Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 23 Feb 2011 12:39:46 +0100 Subject: drbd: add page pool to be used for meta data IO Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 23 ++++++++++++++++++++++- drivers/block/drbd/drbd_main.c | 9 +++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 6586053429b4..685ed4cca173 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1496,11 +1496,32 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ extern mempool_t *drbd_request_mempool; extern mempool_t *drbd_ee_mempool; -extern struct page *drbd_pp_pool; /* drbd's page pool */ +/* drbd's page pool, used to buffer data received from the peer, + * or data requested by the peer. + * + * This does not have an emergency reserve. + * + * When allocating from this pool, it first takes pages from the pool. + * Only if the pool is depleted will try to allocate from the system. + * + * The assumption is that pages taken from this pool will be processed, + * and given back, "quickly", and then can be recycled, so we can avoid + * frequent calls to alloc_page(), and still will be able to make progress even + * under memory pressure. + */ +extern struct page *drbd_pp_pool; extern spinlock_t drbd_pp_lock; extern int drbd_pp_vacant; extern wait_queue_head_t drbd_pp_wait; +/* We also need a standard (emergency-reserve backed) page pool + * for meta data IO (activity log, bitmap). + * We can keep it global, as long as it is used as "N pages at a time". + * 128 should be plenty, currently we probably can get away with as few as 1. + */ +#define DRBD_MIN_POOL_PAGES 128 +extern mempool_t *drbd_md_io_page_pool; + extern rwlock_t global_state_lock; extern struct drbd_conf *drbd_new_device(unsigned int minor); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index d830116781f9..ed264c92c3af 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -139,6 +139,7 @@ struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ mempool_t *drbd_request_mempool; mempool_t *drbd_ee_mempool; +mempool_t *drbd_md_io_page_pool; /* I do not use a standard mempool, because: 1) I want to hand out the pre-allocated objects first. @@ -3264,6 +3265,8 @@ static void drbd_destroy_mempools(void) /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ + if (drbd_md_io_page_pool) + mempool_destroy(drbd_md_io_page_pool); if (drbd_ee_mempool) mempool_destroy(drbd_ee_mempool); if (drbd_request_mempool) @@ -3277,6 +3280,7 @@ static void drbd_destroy_mempools(void) if (drbd_al_ext_cache) kmem_cache_destroy(drbd_al_ext_cache); + drbd_md_io_page_pool = NULL; drbd_ee_mempool = NULL; drbd_request_mempool = NULL; drbd_ee_cache = NULL; @@ -3300,6 +3304,7 @@ static int drbd_create_mempools(void) drbd_bm_ext_cache = NULL; drbd_al_ext_cache = NULL; drbd_pp_pool = NULL; + drbd_md_io_page_pool = NULL; /* caches */ drbd_request_cache = kmem_cache_create( @@ -3323,6 +3328,10 @@ static int drbd_create_mempools(void) goto Enomem; /* mempools */ + drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_page_pool == NULL) + goto Enomem; + drbd_request_mempool = mempool_create(number, mempool_alloc_slab, mempool_free_slab, drbd_request_cache); if (drbd_request_mempool == NULL) -- cgit v1.2.3 From 4d95a10f97337415c1f74b4901d80e047f8dc128 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 23 Feb 2011 15:38:47 +0100 Subject: drbd: use the newly introduced page pool for bitmap IO Conflicts: drbd/drbd_bitmap.c Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_bitmap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 9dab3700ca2d..39a1b0dafff4 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -939,9 +939,8 @@ static void bm_async_io_complete(struct bio *bio, int error) bm_page_unlock_io(mdev, idx); - /* FIXME give back to page pool */ if (ctx->flags & BM_AIO_COPY_PAGES) - put_page(bio->bi_io_vec[0].bv_page); + mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool); bio_put(bio); @@ -978,10 +977,8 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must bm_set_page_unchanged(b->bm_pages[page_nr]); if (ctx->flags & BM_AIO_COPY_PAGES) { - /* FIXME alloc_page is good enough for now, but actually needs - * to use pre-allocated page pool */ void *src, *dest; - page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT); + page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); dest = kmap_atomic(page); src = kmap_atomic(b->bm_pages[page_nr]); memcpy(dest, src, PAGE_SIZE); @@ -993,6 +990,8 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must bio->bi_bdev = mdev->ldev->md_bdev; bio->bi_sector = on_disk_sector; + /* bio_add_page of a single page to an empty bio will always succeed, + * according to api. Do we want to assert that? */ bio_add_page(bio, page, len, 0); bio->bi_private = ctx; bio->bi_end_io = bm_async_io_complete; -- cgit v1.2.3 From 0c7db27920a87a8db73ca521b527617eceec3bca Mon Sep 17 00:00:00 2001 From: Arne Redlich Date: Fri, 16 Mar 2012 08:19:33 +0100 Subject: drbd: bm_page_async_io: properly initialize page->private If bm_page_async_io is advised to use a new page for I/O (BM_AIO_COPY_PAGES is set), it will get it from a mempool. Once the mempool has to dip into its reserves the page is not reinitialized, i.e. page->private contains garbage, which will lead to various problems once the I/O completes (dereferences of NULL pointers, the submitting thread getting stuck in D-state, ...). Signed-off-by: Arne Redlich Signed-off-by: Lars Ellenberg Signed-off-by: Philipp Reisner --- drivers/block/drbd/drbd_bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 39a1b0dafff4..a24eb787a7a1 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -205,7 +205,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev) static void bm_store_page_idx(struct page *page, unsigned long idx) { BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK)); - page_private(page) |= idx; + set_page_private(page, idx); } static unsigned long bm_page_to_idx(struct page *page) -- cgit v1.2.3 From 3c2f7a856f2e70d2f1bb59f65d97a66047f14f36 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 23 Feb 2011 17:18:24 +0100 Subject: drbd: remove unused define Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index ed264c92c3af..c03e87f19e86 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -160,8 +160,6 @@ static const struct block_device_operations drbd_ops = { .release = drbd_release, }; -#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) - #ifdef __CHECKER__ /* When checking with sparse, and this is an inline function, sparse will give tons of false positives. When this is a real functions sparse works. -- cgit v1.2.3 From 9476f39d66041ca8c66546671765b4047bffa895 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 23 Feb 2011 17:02:01 +0100 Subject: drbd: introduce a bio_set to allocate housekeeping bios from Don't rely on availability of bios from the global fs_bio_set, we should use our own bio_set for meta data IO. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 2 +- drivers/block/drbd/drbd_bitmap.c | 3 +-- drivers/block/drbd/drbd_int.h | 6 ++++++ drivers/block/drbd/drbd_main.c | 30 ++++++++++++++++++++++++++++++ drivers/block/drbd/drbd_receiver.c | 6 +++++- 5 files changed, 43 insertions(+), 4 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 0a35f82f1222..e54e31b02b88 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -115,7 +115,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, rw |= REQ_FUA | REQ_FLUSH; rw |= REQ_SYNC; - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc_drbd(GFP_NOIO); bio->bi_bdev = bdev->md_bdev; bio->bi_sector = sector; ok = (bio_add_page(bio, page, size, 0) == size); diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index a24eb787a7a1..b5c5ff53cb57 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -953,8 +953,7 @@ static void bm_async_io_complete(struct bio *bio, int error) static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local) { - /* we are process context. we always get a bio */ - struct bio *bio = bio_alloc(GFP_NOIO, 1); + struct bio *bio = bio_alloc_drbd(GFP_NOIO); struct drbd_conf *mdev = ctx->mdev; struct drbd_bitmap *b = mdev->bitmap; struct page *page; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 685ed4cca173..02f013a073a7 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1522,6 +1522,12 @@ extern wait_queue_head_t drbd_pp_wait; #define DRBD_MIN_POOL_PAGES 128 extern mempool_t *drbd_md_io_page_pool; +/* We also need to make sure we get a bio + * when we need it for housekeeping purposes */ +extern struct bio_set *drbd_md_io_bio_set; +/* to allocate from that set */ +extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); + extern rwlock_t global_state_lock; extern struct drbd_conf *drbd_new_device(unsigned int minor); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index c03e87f19e86..bd380b94fd08 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -140,6 +140,7 @@ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ mempool_t *drbd_request_mempool; mempool_t *drbd_ee_mempool; mempool_t *drbd_md_io_page_pool; +struct bio_set *drbd_md_io_bio_set; /* I do not use a standard mempool, because: 1) I want to hand out the pre-allocated objects first. @@ -160,6 +161,25 @@ static const struct block_device_operations drbd_ops = { .release = drbd_release, }; +static void bio_destructor_drbd(struct bio *bio) +{ + bio_free(bio, drbd_md_io_bio_set); +} + +struct bio *bio_alloc_drbd(gfp_t gfp_mask) +{ + struct bio *bio; + + if (!drbd_md_io_bio_set) + return bio_alloc(gfp_mask, 1); + + bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); + if (!bio) + return NULL; + bio->bi_destructor = bio_destructor_drbd; + return bio; +} + #ifdef __CHECKER__ /* When checking with sparse, and this is an inline function, sparse will give tons of false positives. When this is a real functions sparse works. @@ -3263,6 +3283,8 @@ static void drbd_destroy_mempools(void) /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ + if (drbd_md_io_bio_set) + bioset_free(drbd_md_io_bio_set); if (drbd_md_io_page_pool) mempool_destroy(drbd_md_io_page_pool); if (drbd_ee_mempool) @@ -3278,6 +3300,7 @@ static void drbd_destroy_mempools(void) if (drbd_al_ext_cache) kmem_cache_destroy(drbd_al_ext_cache); + drbd_md_io_bio_set = NULL; drbd_md_io_page_pool = NULL; drbd_ee_mempool = NULL; drbd_request_mempool = NULL; @@ -3303,6 +3326,7 @@ static int drbd_create_mempools(void) drbd_al_ext_cache = NULL; drbd_pp_pool = NULL; drbd_md_io_page_pool = NULL; + drbd_md_io_bio_set = NULL; /* caches */ drbd_request_cache = kmem_cache_create( @@ -3326,6 +3350,12 @@ static int drbd_create_mempools(void) goto Enomem; /* mempools */ +#ifdef COMPAT_HAVE_BIOSET_CREATE + drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_bio_set == NULL) + goto Enomem; +#endif + drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); if (drbd_md_io_page_pool == NULL) goto Enomem; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 017eeb745ed9..247a79aec895 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1106,7 +1106,11 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, /* In most cases, we will only need one bio. But in case the lower * level restrictions happen to be different at this offset on this * side than those of the sending peer, we may need to submit the - * request in more than one bio. */ + * request in more than one bio. + * + * Plain bio_alloc is good enough here, this is no DRBD internally + * generated bio, but a bio allocated on behalf of the peer. + */ next_bio: bio = bio_alloc(GFP_NOIO, nr_pages); if (!bio) { -- cgit v1.2.3 From f6d0a8dbfdce4b4f28fcb0f689c373874646f87c Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 30 Apr 2012 12:53:52 +0200 Subject: drbd: Restore the request restart logic It got lost with the commit 5a7bbad27a410350e64a2d7f5ec18fc73836c14f "block: remove support for bio remapping from ->make_request" Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 4a642ce62bae..9c5c84946b05 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -871,7 +871,7 @@ allocate_barrier: if (is_susp(mdev->state)) { /* If we got suspended, use the retry mechanism of - generic_make_request() to restart processing of this + drbd_make_request() to restart processing of this bio. In the next call to drbd_make_request we sleep in inc_ap_bio() */ ret = 1; @@ -1102,8 +1102,9 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; if (likely(s_enr == e_enr)) { - inc_ap_bio(mdev, 1); - drbd_make_request_common(mdev, bio, start_time); + do { + inc_ap_bio(mdev, 1); + } while (drbd_make_request_common(mdev, bio, start_time)); return; } -- cgit v1.2.3 From bc4854bc91c9a7f117437215cd8b16a0a5671d93 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 3 Apr 2012 14:13:36 +0800 Subject: drbd: check MODULE for THIS_MODULE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit THIS_MODULE is NULL only when drbd is compiled as built-in, so the #ifdef CONFIG_MODULES should be #ifdef MODULE instead. This fixes the warning: drivers/block/drbd/drbd_main.c: In function ‘drbd_buildtag’: drivers/block/drbd/drbd_main.c:4187:24: warning: the comparison will always evaluate as ‘true’ for the address of ‘__this_module’ will never be NULL [-Waddress] Signed-off-by: WANG Cong Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index bd380b94fd08..920ede2829d6 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -4365,12 +4365,11 @@ const char *drbd_buildtag(void) static char buildtag[38] = "\0uilt-in"; if (buildtag[0] == 0) { -#ifdef CONFIG_MODULES - if (THIS_MODULE != NULL) - sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); - else +#ifdef MODULE + sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); +#else + buildtag[0] = 'b'; #endif - buildtag[0] = 'b'; } return buildtag; -- cgit v1.2.3 From 92b4ca291f8676c9f323166a65fb7447774b2a46 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 30 Apr 2012 12:53:52 +0200 Subject: drbd: grammar fix in log message Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 2 +- include/linux/drbd.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 247a79aec895..1d088c478150 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2455,7 +2455,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; - dev_info(DEV, "Did not got last syncUUID packet, corrected:\n"); + dev_info(DEV, "Lost last syncUUID packet, corrected:\n"); drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); return -1; diff --git a/include/linux/drbd.h b/include/linux/drbd.h index cb8728b28432..47e3d4850584 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,7 +53,7 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.11" +#define REL_VERSION "8.3.13" #define API_VERSION 88 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 96 -- cgit v1.2.3 From 38bf1953987c1735f3c9140fca762949a8cae507 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 4 May 2012 11:34:03 +0000 Subject: connector/userns: replace netlink uses of cap_raised() with capable() In 2009 Philip Reiser notied that a few users of netlink connector interface needed a capability check and added the idiom cap_raised(nsp->eff_cap, CAP_SYS_ADMIN) to a few of them, on the premise that netlink was asynchronous. In 2011 Patrick McHardy noticed we were being silly because netlink is synchronous and removed eff_cap from the netlink_skb_params and changed the idiom to cap_raised(current_cap(), CAP_SYS_ADMIN). Looking at those spots with a fresh eye we should be calling capable(CAP_SYS_ADMIN). The only reason I can see for not calling capable is that it once appeared we were not in the same task as the caller which would have made calling capable() impossible. In the initial user_namespace the only difference between between cap_raised(current_cap(), CAP_SYS_ADMIN) and capable(CAP_SYS_ADMIN) are a few sanity checks and the fact that capable(CAP_SYS_ADMIN) sets PF_SUPERPRIV if we use the capability. Since we are going to be using root privilege setting PF_SUPERPRIV seems the right thing to do. The motivation for this that patch is that in a child user namespace cap_raised(current_cap(),...) tests your capabilities with respect to that child user namespace not capabilities in the initial user namespace and thus will allow processes that should be unprivielged to use the kernel services that are only protected with cap_raised(current_cap(),..). To fix possible user_namespace issues and to just clean up the code replace cap_raised(current_cap(), CAP_SYS_ADMIN) with capable(CAP_SYS_ADMIN). Signed-off-by: Eric W. Biederman Cc: Patrick McHardy Cc: Philipp Reisner Acked-by: Serge E. Hallyn Acked-by: Andrew G. Morgan Cc: Vasiliy Kulikov Cc: David Howells Reviewed-by: James Morris Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- drivers/block/drbd/drbd_nl.c | 2 +- drivers/md/dm-log-userspace-transfer.c | 2 +- drivers/video/uvesafb.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/block/drbd') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index abfaacaaf346..946166e13953 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2297,7 +2297,7 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms return; } - if (!cap_raised(current_cap(), CAP_SYS_ADMIN)) { + if (!capable(CAP_SYS_ADMIN)) { retcode = ERR_PERM; goto fail; } diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 1f23e048f077..08d9a207259a 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c @@ -134,7 +134,7 @@ static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) { struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); - if (!cap_raised(current_cap(), CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return; spin_lock(&receiving_list_lock); diff --git a/drivers/video/uvesafb.c b/drivers/video/uvesafb.c index 26e83d7fdd6f..b0e2a4261afe 100644 --- a/drivers/video/uvesafb.c +++ b/drivers/video/uvesafb.c @@ -73,7 +73,7 @@ static void uvesafb_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *ns struct uvesafb_task *utask; struct uvesafb_ktask *task; - if (!cap_raised(current_cap(), CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return; if (msg->seq >= UVESAFB_TASKS_MAX) -- cgit v1.2.3