diff options
Diffstat (limited to 'drivers/block/drbd')
-rw-r--r-- | drivers/block/drbd/drbd_debugfs.c | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 19 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 30 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 133 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_protocol.h | 47 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 253 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 19 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.h | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_state.c | 11 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_state.h | 5 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 2 |
11 files changed, 431 insertions, 92 deletions
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index 5d5e8d6a8a56..f13b48ff5f43 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -237,6 +237,8 @@ static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_re seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL"); seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C"); seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync"); + seq_print_rq_state_bit(m, f & EE_TRIM, &sep, "trim"); + seq_print_rq_state_bit(m, f & EE_ZEROOUT, &sep, "zero-out"); seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same"); seq_putc(m, '\n'); } diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 1e47db57b9d2..000a2f4c0e92 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -430,7 +430,11 @@ enum { __EE_MAY_SET_IN_SYNC, /* is this a TRIM aka REQ_OP_DISCARD? */ - __EE_IS_TRIM, + __EE_TRIM, + /* explicit zero-out requested, or + * our lower level cannot handle trim, + * and we want to fall back to zeroout instead */ + __EE_ZEROOUT, /* In case a barrier failed, * we need to resubmit without the barrier flag. */ @@ -472,7 +476,8 @@ enum { }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) -#define EE_IS_TRIM (1<<__EE_IS_TRIM) +#define EE_TRIM (1<<__EE_TRIM) +#define EE_ZEROOUT (1<<__EE_ZEROOUT) #define EE_RESUBMITTED (1<<__EE_RESUBMITTED) #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) #define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) @@ -1556,6 +1561,8 @@ extern void start_resync_timer_fn(struct timer_list *t); extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); /* drbd_receiver.c */ +extern int drbd_issue_discard_or_zero_out(struct drbd_device *device, + sector_t start, unsigned int nr_sectors, int flags); extern int drbd_receiver(struct drbd_thread *thi); extern int drbd_ack_receiver(struct drbd_thread *thi); extern void drbd_send_ping_wf(struct work_struct *ws); @@ -1609,13 +1616,7 @@ static inline void drbd_tcp_quickack(struct socket *sock) } /* sets the number of 512 byte sectors of our virtual device */ -static inline void drbd_set_my_capacity(struct drbd_device *device, - sector_t size) -{ - /* set_capacity(device->this_bdev->bd_disk, size); */ - set_capacity(device->vdisk, size); - device->this_bdev->bd_inode->i_size = (loff_t)size << 9; -} +void drbd_set_my_capacity(struct drbd_device *device, sector_t size); /* * used to submit our private bio diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index fa8204214ac0..714eb64fabfd 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1668,7 +1668,11 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection, (bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : 0) | (bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) | (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0) | - (bio_op(bio) == REQ_OP_WRITE_ZEROES ? DP_DISCARD : 0); + (bio_op(bio) == REQ_OP_WRITE_ZEROES ? + ((connection->agreed_features & DRBD_FF_WZEROES) ? + (DP_ZEROES |(!(bio->bi_opf & REQ_NOUNMAP) ? DP_DISCARD : 0)) + : DP_DISCARD) + : 0); else return bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0; } @@ -1712,10 +1716,11 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * } p->dp_flags = cpu_to_be32(dp_flags); - if (dp_flags & DP_DISCARD) { + if (dp_flags & (DP_DISCARD|DP_ZEROES)) { + enum drbd_packet cmd = (dp_flags & DP_ZEROES) ? P_ZEROES : P_TRIM; struct p_trim *t = (struct p_trim*)p; t->size = cpu_to_be32(req->i.size); - err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0); + err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*t), NULL, 0); goto out; } if (dp_flags & DP_WSAME) { @@ -2034,6 +2039,21 @@ void drbd_init_set_defaults(struct drbd_device *device) device->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; } +static void _drbd_set_my_capacity(struct drbd_device *device, sector_t size) +{ + /* set_capacity(device->this_bdev->bd_disk, size); */ + set_capacity(device->vdisk, size); + device->this_bdev->bd_inode->i_size = (loff_t)size << 9; +} + +void drbd_set_my_capacity(struct drbd_device *device, sector_t size) +{ + char ppb[10]; + _drbd_set_my_capacity(device, size); + drbd_info(device, "size = %s (%llu KB)\n", + ppsize(ppb, size>>1), (unsigned long long)size>>1); +} + void drbd_device_cleanup(struct drbd_device *device) { int i; @@ -2059,7 +2079,7 @@ void drbd_device_cleanup(struct drbd_device *device) } D_ASSERT(device, first_peer_device(device)->connection->net_conf == NULL); - drbd_set_my_capacity(device, 0); + _drbd_set_my_capacity(device, 0); if (device->bitmap) { /* maybe never allocated. */ drbd_bm_resize(device, 0, 1); @@ -2792,7 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_init_set_defaults(device); - q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, &resource->req_lock); + q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); if (!q) goto out_no_q; device->rq_queue = q; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index d15703b1ffe8..f2471172a961 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -127,6 +127,35 @@ static int drbd_msg_put_info(struct sk_buff *skb, const char *info) return 0; } +__printf(2, 3) +static int drbd_msg_sprintf_info(struct sk_buff *skb, const char *fmt, ...) +{ + va_list args; + struct nlattr *nla, *txt; + int err = -EMSGSIZE; + int len; + + nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY); + if (!nla) + return err; + + txt = nla_reserve(skb, T_info_text, 256); + if (!txt) { + nla_nest_cancel(skb, nla); + return err; + } + va_start(args, fmt); + len = vscnprintf(nla_data(txt), 256, fmt, args); + va_end(args); + + /* maybe: retry with larger reserve, if truncated */ + txt->nla_len = nla_attr_size(len+1); + nlmsg_trim(skb, (char*)txt + NLA_ALIGN(txt->nla_len)); + nla_nest_end(skb, nla); + + return 0; +} + /* This would be a good candidate for a "pre_doit" hook, * and per-family private info->pointers. * But we need to stay compatible with older kernels. @@ -668,14 +697,15 @@ drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int for if (rv == SS_TWO_PRIMARIES) { /* Maybe the peer is detected as dead very soon... retry at most once more in this case. */ - int timeo; - rcu_read_lock(); - nc = rcu_dereference(connection->net_conf); - timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1; - rcu_read_unlock(); - schedule_timeout_interruptible(timeo); - if (try < max_tries) + if (try < max_tries) { + int timeo; try = max_tries - 1; + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1; + rcu_read_unlock(); + schedule_timeout_interruptible(timeo); + } continue; } if (rv < SS_SUCCESS) { @@ -921,7 +951,6 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct } prev; sector_t u_size, size; struct drbd_md *md = &device->ldev->md; - char ppb[10]; void *buffer; int md_moved, la_size_changed; @@ -999,8 +1028,6 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct /* racy, see comments above. */ drbd_set_my_capacity(device, size); md->la_size_sect = size; - drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), - (unsigned long long)size>>1); } if (rv <= DS_ERROR) goto err_out; @@ -1234,6 +1261,21 @@ static void fixup_discard_if_not_supported(struct request_queue *q) } } +static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q) +{ + /* Fixup max_write_zeroes_sectors after blk_queue_stack_limits(): + * if we can handle "zeroes" efficiently on the protocol, + * we want to do that, even if our backend does not announce + * max_write_zeroes_sectors itself. */ + struct drbd_connection *connection = first_peer_device(device)->connection; + /* If the peer announces WZEROES support, use it. Otherwise, rather + * send explicit zeroes than rely on some discard-zeroes-data magic. */ + if (connection->agreed_features & DRBD_FF_WZEROES) + q->limits.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS; + else + q->limits.max_write_zeroes_sectors = 0; +} + static void decide_on_write_same_support(struct drbd_device *device, struct request_queue *q, struct request_queue *b, struct o_qlim *o, @@ -1344,6 +1386,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi } } fixup_discard_if_not_supported(q); + fixup_write_zeroes(device, q); } void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o) @@ -1514,6 +1557,30 @@ static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *dis } } +static int disk_opts_check_al_size(struct drbd_device *device, struct disk_conf *dc) +{ + int err = -EBUSY; + + if (device->act_log && + device->act_log->nr_elements == dc->al_extents) + return 0; + + drbd_suspend_io(device); + /* If IO completion is currently blocked, we would likely wait + * "forever" for the activity log to become unused. So we don't. */ + if (atomic_read(&device->ap_bio_cnt)) + goto out; + + wait_event(device->al_wait, lc_try_lock(device->act_log)); + drbd_al_shrink(device); + err = drbd_check_al_size(device, dc); + lc_unlock(device->act_log); + wake_up(&device->al_wait); +out: + drbd_resume_io(device); + return err; +} + int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -1576,15 +1643,12 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) } } - drbd_suspend_io(device); - wait_event(device->al_wait, lc_try_lock(device->act_log)); - drbd_al_shrink(device); - err = drbd_check_al_size(device, new_disk_conf); - lc_unlock(device->act_log); - wake_up(&device->al_wait); - drbd_resume_io(device); - + err = disk_opts_check_al_size(device, new_disk_conf); if (err) { + /* Could be just "busy". Ignore? + * Introduce dedicated error code? */ + drbd_msg_put_info(adm_ctx.reply_skb, + "Try again without changing current al-extents setting"); retcode = ERR_NOMEM; goto fail_unlock; } @@ -1934,9 +1998,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } } - if (device->state.conn < C_CONNECTED && - device->state.role == R_PRIMARY && device->ed_uuid && - (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { + if (device->state.pdsk != D_UP_TO_DATE && device->ed_uuid && + (device->state.role == R_PRIMARY || device->state.peer == R_PRIMARY) && + (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { drbd_err(device, "Can only attach to data with current UUID=%016llX\n", (unsigned long long)device->ed_uuid); retcode = ERR_DATA_NOT_CURRENT; @@ -1950,11 +2014,21 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } /* Prevent shrinking of consistent devices ! */ - if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && - drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) { - drbd_warn(device, "refusing to truncate a consistent device\n"); - retcode = ERR_DISK_TOO_SMALL; - goto force_diskless_dec; + { + unsigned long long nsz = drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0); + unsigned long long eff = nbc->md.la_size_sect; + if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && nsz < eff) { + if (nsz == nbc->disk_conf->disk_size) { + drbd_warn(device, "truncating a consistent device during attach (%llu < %llu)\n", nsz, eff); + } else { + drbd_warn(device, "refusing to truncate a consistent device (%llu < %llu)\n", nsz, eff); + drbd_msg_sprintf_info(adm_ctx.reply_skb, + "To-be-attached device has last effective > current size, and is consistent\n" + "(%llu > %llu sectors). Refusing to attach.", eff, nsz); + retcode = ERR_IMPLICIT_SHRINK; + goto force_diskless_dec; + } + } } lock_all_resources(); @@ -2654,8 +2728,10 @@ out: static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection, bool force) { + enum drbd_conns cstate; enum drbd_state_rv rv; +repeat: rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), force ? CS_HARD : 0); @@ -2673,6 +2749,11 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection break; case SS_CW_FAILED_BY_PEER: + spin_lock_irq(&connection->resource->req_lock); + cstate = connection->cstate; + spin_unlock_irq(&connection->resource->req_lock); + if (cstate <= C_WF_CONNECTION) + goto repeat; /* The peer probably wants to see us outdated. */ rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, disk, D_OUTDATED), 0); diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index 48dabbb21e11..e6fc5ad72501 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -70,6 +70,11 @@ enum drbd_packet { * we may fall back to an opencoded loop instead. */ P_WSAME = 0x34, + /* 0x35 already claimed in DRBD 9 */ + P_ZEROES = 0x36, /* data sock: zero-out, WRITE_ZEROES */ + + /* 0x40 .. 0x48 already claimed in DRBD 9 */ + P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ P_MAX_OPT_CMD = 0x101, @@ -130,6 +135,12 @@ struct p_header100 { #define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ #define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ #define DP_WSAME 512 /* equiv. REQ_WRITE_SAME */ +#define DP_ZEROES 1024 /* equiv. REQ_OP_WRITE_ZEROES */ + +/* possible combinations: + * REQ_OP_WRITE_ZEROES: DP_DISCARD | DP_ZEROES + * REQ_OP_WRITE_ZEROES + REQ_NOUNMAP: DP_ZEROES + */ struct p_data { u64 sector; /* 64 bits sector number */ @@ -197,6 +208,42 @@ struct p_block_req { */ #define DRBD_FF_WSAME 4 +/* supports REQ_OP_WRITE_ZEROES on the "wire" protocol. + * + * We used to map that to "discard" on the sending side, and if we cannot + * guarantee that discard zeroes data, the receiving side would map discard + * back to zero-out. + * + * With the introduction of REQ_OP_WRITE_ZEROES, + * we started to use that for both WRITE_ZEROES and DISCARDS, + * hoping that WRITE_ZEROES would "do what we want", + * UNMAP if possible, zero-out the rest. + * + * The example scenario is some LVM "thin" backend. + * + * While an un-allocated block on dm-thin reads as zeroes, on a dm-thin + * with "skip_block_zeroing=true", after a partial block write allocated + * that block, that same block may well map "undefined old garbage" from + * the backends on LBAs that have not yet been written to. + * + * If we cannot distinguish between zero-out and discard on the receiving + * side, to avoid "undefined old garbage" to pop up randomly at later times + * on supposedly zero-initialized blocks, we'd need to map all discards to + * zero-out on the receiving side. But that would potentially do a full + * alloc on thinly provisioned backends, even when the expectation was to + * unmap/trim/discard/de-allocate. + * + * We need to distinguish on the protocol level, whether we need to guarantee + * zeroes (and thus use zero-out, potentially doing the mentioned full-alloc), + * or if we want to put the emphasis on discard, and only do a "best effort + * zeroing" (by "discarding" blocks aligned to discard-granularity, and zeroing + * only potential unaligned head and tail clippings), to at least *try* to + * avoid "false positives" in an online-verify later, hoping that someone + * set skip_block_zeroing=false. + */ +#define DRBD_FF_WZEROES 8 + + struct p_connection_features { u32 protocol_min; u32 feature_flags; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 61c392752fe4..c7ad88d91a09 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -50,7 +50,7 @@ #include "drbd_req.h" #include "drbd_vli.h" -#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME) +#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME|DRBD_FF_WZEROES) struct packet_info { enum drbd_packet cmd; @@ -1490,14 +1490,129 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); } -static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req) +/* + * Mapping "discard" to ZEROOUT with UNMAP does not work for us: + * Drivers have to "announce" q->limits.max_write_zeroes_sectors, or it + * will directly go to fallback mode, submitting normal writes, and + * never even try to UNMAP. + * + * And dm-thin does not do this (yet), mostly because in general it has + * to assume that "skip_block_zeroing" is set. See also: + * https://www.mail-archive.com/dm-devel%40redhat.com/msg07965.html + * https://www.redhat.com/archives/dm-devel/2018-January/msg00271.html + * + * We *may* ignore the discard-zeroes-data setting, if so configured. + * + * Assumption is that this "discard_zeroes_data=0" is only because the backend + * may ignore partial unaligned discards. + * + * LVM/DM thin as of at least + * LVM version: 2.02.115(2)-RHEL7 (2015-01-28) + * Library version: 1.02.93-RHEL7 (2015-01-28) + * Driver version: 4.29.0 + * still behaves this way. + * + * For unaligned (wrt. alignment and granularity) or too small discards, + * we zero-out the initial (and/or) trailing unaligned partial chunks, + * but discard all the aligned full chunks. + * + * At least for LVM/DM thin, with skip_block_zeroing=false, + * the result is effectively "discard_zeroes_data=1". + */ +/* flags: EE_TRIM|EE_ZEROOUT */ +int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags) { struct block_device *bdev = device->ldev->backing_bdev; + struct request_queue *q = bdev_get_queue(bdev); + sector_t tmp, nr; + unsigned int max_discard_sectors, granularity; + int alignment; + int err = 0; - if (blkdev_issue_zeroout(bdev, peer_req->i.sector, peer_req->i.size >> 9, - GFP_NOIO, 0)) - peer_req->flags |= EE_WAS_ERROR; + if ((flags & EE_ZEROOUT) || !(flags & EE_TRIM)) + goto zero_out; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; + + max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22)); + max_discard_sectors -= max_discard_sectors % granularity; + if (unlikely(!max_discard_sectors)) + goto zero_out; + + if (nr_sectors < granularity) + goto zero_out; + + tmp = start; + if (sector_div(tmp, granularity) != alignment) { + if (nr_sectors < 2*granularity) + goto zero_out; + /* start + gran - (start + gran - align) % gran */ + tmp = start + granularity - alignment; + tmp = start + granularity - sector_div(tmp, granularity); + + nr = tmp - start; + /* don't flag BLKDEV_ZERO_NOUNMAP, we don't know how many + * layers are below us, some may have smaller granularity */ + err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0); + nr_sectors -= nr; + start = tmp; + } + while (nr_sectors >= max_discard_sectors) { + err |= blkdev_issue_discard(bdev, start, max_discard_sectors, GFP_NOIO, 0); + nr_sectors -= max_discard_sectors; + start += max_discard_sectors; + } + if (nr_sectors) { + /* max_discard_sectors is unsigned int (and a multiple of + * granularity, we made sure of that above already); + * nr is < max_discard_sectors; + * I don't need sector_div here, even though nr is sector_t */ + nr = nr_sectors; + nr -= (unsigned int)nr % granularity; + if (nr) { + err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0); + nr_sectors -= nr; + start += nr; + } + } + zero_out: + if (nr_sectors) { + err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, + (flags & EE_TRIM) ? 0 : BLKDEV_ZERO_NOUNMAP); + } + return err != 0; +} +static bool can_do_reliable_discards(struct drbd_device *device) +{ + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); + struct disk_conf *dc; + bool can_do; + + if (!blk_queue_discard(q)) + return false; + + rcu_read_lock(); + dc = rcu_dereference(device->ldev->disk_conf); + can_do = dc->discard_zeroes_if_aligned; + rcu_read_unlock(); + return can_do; +} + +static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, struct drbd_peer_request *peer_req) +{ + /* If the backend cannot discard, or does not guarantee + * read-back zeroes in discarded ranges, we fall back to + * zero-out. Unless configuration specifically requested + * otherwise. */ + if (!can_do_reliable_discards(device)) + peer_req->flags |= EE_ZEROOUT; + + if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector, + peer_req->i.size >> 9, peer_req->flags & (EE_ZEROOUT|EE_TRIM))) + peer_req->flags |= EE_WAS_ERROR; drbd_endio_write_sec_final(peer_req); } @@ -1550,7 +1665,7 @@ int drbd_submit_peer_request(struct drbd_device *device, * Correctness first, performance later. Next step is to code an * asynchronous variant of the same. */ - if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) { + if (peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) { /* wait for all pending IO completions, before we start * zeroing things out. */ conn_wait_active_ee_empty(peer_req->peer_device->connection); @@ -1567,8 +1682,8 @@ int drbd_submit_peer_request(struct drbd_device *device, spin_unlock_irq(&device->resource->req_lock); } - if (peer_req->flags & EE_IS_TRIM) - drbd_issue_peer_discard(device, peer_req); + if (peer_req->flags & (EE_TRIM|EE_ZEROOUT)) + drbd_issue_peer_discard_or_zero_out(device, peer_req); else /* EE_WRITE_SAME */ drbd_issue_peer_wsame(device, peer_req); return 0; @@ -1765,6 +1880,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, void *dig_vv = peer_device->connection->int_dig_vv; unsigned long *data; struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; + struct p_trim *zeroes = (pi->cmd == P_ZEROES) ? pi->data : NULL; struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL; digest_size = 0; @@ -1786,6 +1902,10 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, if (!expect(data_size == 0)) return NULL; ds = be32_to_cpu(trim->size); + } else if (zeroes) { + if (!expect(data_size == 0)) + return NULL; + ds = be32_to_cpu(zeroes->size); } else if (wsame) { if (data_size != queue_logical_block_size(device->rq_queue)) { drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n", @@ -1802,7 +1922,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, if (!expect(IS_ALIGNED(ds, 512))) return NULL; - if (trim || wsame) { + if (trim || wsame || zeroes) { if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9))) return NULL; } else if (!expect(ds <= DRBD_MAX_BIO_SIZE)) @@ -1827,7 +1947,11 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, peer_req->flags |= EE_WRITE; if (trim) { - peer_req->flags |= EE_IS_TRIM; + peer_req->flags |= EE_TRIM; + return peer_req; + } + if (zeroes) { + peer_req->flags |= EE_ZEROOUT; return peer_req; } if (wsame) @@ -2326,8 +2450,12 @@ static unsigned long wire_flags_to_bio_flags(u32 dpf) static unsigned long wire_flags_to_bio_op(u32 dpf) { - if (dpf & DP_DISCARD) + if (dpf & DP_ZEROES) return REQ_OP_WRITE_ZEROES; + if (dpf & DP_DISCARD) + return REQ_OP_DISCARD; + if (dpf & DP_WSAME) + return REQ_OP_WRITE_SAME; else return REQ_OP_WRITE; } @@ -2518,8 +2646,19 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * op_flags = wire_flags_to_bio_flags(dp_flags); if (pi->cmd == P_TRIM) { D_ASSERT(peer_device, peer_req->i.size > 0); + D_ASSERT(peer_device, op == REQ_OP_DISCARD); + D_ASSERT(peer_device, peer_req->pages == NULL); + /* need to play safe: an older DRBD sender + * may mean zero-out while sending P_TRIM. */ + if (0 == (connection->agreed_features & DRBD_FF_WZEROES)) + peer_req->flags |= EE_ZEROOUT; + } else if (pi->cmd == P_ZEROES) { + D_ASSERT(peer_device, peer_req->i.size > 0); D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES); D_ASSERT(peer_device, peer_req->pages == NULL); + /* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */ + if (dp_flags & DP_DISCARD) + peer_req->flags |= EE_TRIM; } else if (peer_req->pages == NULL) { D_ASSERT(device, peer_req->i.size == 0); D_ASSERT(device, dp_flags & DP_FLUSH); @@ -2587,7 +2726,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * * we wait for all pending requests, respectively wait for * active_ee to become empty in drbd_submit_peer_request(); * better not add ourselves here. */ - if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0) + if ((peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) == 0) list_add_tail(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); @@ -3364,7 +3503,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, enum drbd_conns rv = C_MASK; enum drbd_disk_state mydisk; struct net_conf *nc; - int hg, rule_nr, rr_conflict, tentative; + int hg, rule_nr, rr_conflict, tentative, always_asbp; mydisk = device->state.disk; if (mydisk == D_NEGOTIATING) @@ -3415,8 +3554,12 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, rcu_read_lock(); nc = rcu_dereference(peer_device->connection->net_conf); + always_asbp = nc->always_asbp; + rr_conflict = nc->rr_conflict; + tentative = nc->tentative; + rcu_read_unlock(); - if (hg == 100 || (hg == -100 && nc->always_asbp)) { + if (hg == 100 || (hg == -100 && always_asbp)) { int pcount = (device->state.role == R_PRIMARY) + (peer_role == R_PRIMARY); int forced = (hg == -100); @@ -3455,9 +3598,6 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, "Sync from %s node\n", (hg < 0) ? "peer" : "this"); } - rr_conflict = nc->rr_conflict; - tentative = nc->tentative; - rcu_read_unlock(); if (hg == -100) { /* FIXME this log message is not correct if we end up here @@ -3623,7 +3763,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in * change. */ - peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, CRYPTO_ALG_ASYNC); + peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, 0); if (IS_ERR(peer_integrity_tfm)) { peer_integrity_tfm = NULL; drbd_err(connection, "peer data-integrity-alg %s not supported\n", @@ -3980,6 +4120,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL; enum determine_dev_size dd = DS_UNCHANGED; sector_t p_size, p_usize, p_csize, my_usize; + sector_t new_size, cur_size; int ldsc = 0; /* local disk size changed */ enum dds_flags ddsf; @@ -3987,6 +4128,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info if (!peer_device) return config_unknown_volume(connection, pi); device = peer_device->device; + cur_size = drbd_get_capacity(device->this_bdev); p_size = be64_to_cpu(p->d_size); p_usize = be64_to_cpu(p->u_size); @@ -3997,7 +4139,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info device->p_size = p_size; if (get_ldev(device)) { - sector_t new_size, cur_size; rcu_read_lock(); my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size; rcu_read_unlock(); @@ -4012,13 +4153,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info if (device->state.conn == C_WF_REPORT_PARAMS) p_usize = min_not_zero(my_usize, p_usize); - /* Never shrink a device with usable data during connect. - But allow online shrinking if we are connected. */ + /* Never shrink a device with usable data during connect, + * or "attach" on the peer. + * But allow online shrinking if we are connected. */ new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0); - cur_size = drbd_get_capacity(device->this_bdev); if (new_size < cur_size && device->state.disk >= D_OUTDATED && - device->state.conn < C_CONNECTED) { + (device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS)) { drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n", (unsigned long long)new_size, (unsigned long long)cur_size); conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); @@ -4046,8 +4187,8 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info synchronize_rcu(); kfree(old_disk_conf); - drbd_info(device, "Peer sets u_size to %lu sectors\n", - (unsigned long)my_usize); + drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n", + (unsigned long)p_usize, (unsigned long)my_usize); } put_ldev(device); @@ -4080,9 +4221,36 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info * * However, if he sends a zero current size, * take his (user-capped or) backing disk size anyways. + * + * Unless of course he does not have a disk himself. + * In which case we ignore this completely. */ + sector_t new_size = p_csize ?: p_usize ?: p_size; drbd_reconsider_queue_parameters(device, NULL, o); - drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size); + if (new_size == 0) { + /* Ignore, peer does not know nothing. */ + } else if (new_size == cur_size) { + /* nothing to do */ + } else if (cur_size != 0 && p_size == 0) { + drbd_warn(device, "Ignored diskless peer device size (peer:%llu != me:%llu sectors)!\n", + (unsigned long long)new_size, (unsigned long long)cur_size); + } else if (new_size < cur_size && device->state.role == R_PRIMARY) { + drbd_err(device, "The peer's device size is too small! (%llu < %llu sectors); demote me first!\n", + (unsigned long long)new_size, (unsigned long long)cur_size); + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; + } else { + /* I believe the peer, if + * - I don't have a current size myself + * - we agree on the size anyways + * - I do have a current size, am Secondary, + * and he has the only disk + * - I do have a current size, am Primary, + * and he has the only disk, + * which is larger than my current size + */ + drbd_set_my_capacity(device, new_size); + } } if (get_ldev(device)) { @@ -4142,7 +4310,7 @@ static int receive_uuids(struct drbd_connection *connection, struct packet_info kfree(device->p_uuid); device->p_uuid = p_uuid; - if (device->state.conn < C_CONNECTED && + if ((device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS) && device->state.disk < D_INCONSISTENT && device->state.role == R_PRIMARY && (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { @@ -4368,6 +4536,25 @@ static int receive_state(struct drbd_connection *connection, struct packet_info if (peer_state.conn == C_AHEAD) ns.conn = C_BEHIND; + /* TODO: + * if (primary and diskless and peer uuid != effective uuid) + * abort attach on peer; + * + * If this node does not have good data, was already connected, but + * the peer did a late attach only now, trying to "negotiate" with me, + * AND I am currently Primary, possibly frozen, with some specific + * "effective" uuid, this should never be reached, really, because + * we first send the uuids, then the current state. + * + * In this scenario, we already dropped the connection hard + * when we received the unsuitable uuids (receive_uuids(). + * + * Should we want to change this, that is: not drop the connection in + * receive_uuids() already, then we would need to add a branch here + * that aborts the attach of "unsuitable uuids" on the peer in case + * this node is currently Diskless Primary. + */ + if (device->p_uuid && peer_state.disk >= D_NEGOTIATING && get_ldev_if_state(device, D_NEGOTIATING)) { int cr; /* consider resync */ @@ -4380,7 +4567,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info (peer_state.disk == D_NEGOTIATING || os.disk == D_NEGOTIATING)); /* if we have both been inconsistent, and the peer has been - * forced to be UpToDate with --overwrite-data */ + * forced to be UpToDate with --force */ cr |= test_bit(CONSIDER_RESYNC, &device->flags); /* if we had been plain connected, and the admin requested to * start a sync by "invalidate" or "invalidate-remote" */ @@ -4845,7 +5032,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac peer_req->w.cb = e_end_resync_block; peer_req->submit_jif = jiffies; - peer_req->flags |= EE_IS_TRIM; + peer_req->flags |= EE_TRIM; spin_lock_irq(&device->resource->req_lock); list_add_tail(&peer_req->w.list, &device->sync_ee); @@ -4913,6 +5100,7 @@ static struct data_cmd drbd_cmd_handler[] = { [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, + [P_ZEROES] = { 0, sizeof(struct p_trim), receive_Data }, [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated }, [P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data }, }; @@ -5197,11 +5385,12 @@ static int drbd_do_features(struct drbd_connection *connection) drbd_info(connection, "Handshake successful: " "Agreed network protocol version %d\n", connection->agreed_pro_version); - drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n", + drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s%s.\n", connection->agreed_features, connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "", connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "", - connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : + connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : "", + connection->agreed_features & DRBD_FF_WZEROES ? " WRITE_ZEROES" : connection->agreed_features ? "" : " none"); return 1; @@ -5284,7 +5473,7 @@ static int drbd_do_auth(struct drbd_connection *connection) if (pi.cmd != P_AUTH_CHALLENGE) { drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n", cmdname(pi.cmd), pi.cmd); - rv = 0; + rv = -1; goto fail; } diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 1c4da17e902e..643a04af213b 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -63,7 +63,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio drbd_req_make_private_bio(req, bio_src); req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0) | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0) - | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_UNMAP : 0) + | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_ZEROES : 0) | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0); req->device = device; req->master_bio = bio_src; @@ -1155,12 +1155,11 @@ static int drbd_process_write_request(struct drbd_request *req) return remote; } -static void drbd_process_discard_req(struct drbd_request *req) +static void drbd_process_discard_or_zeroes_req(struct drbd_request *req, int flags) { - struct block_device *bdev = req->device->ldev->backing_bdev; - - if (blkdev_issue_zeroout(bdev, req->i.sector, req->i.size >> 9, - GFP_NOIO, 0)) + int err = drbd_issue_discard_or_zero_out(req->device, + req->i.sector, req->i.size >> 9, flags); + if (err) req->private_bio->bi_status = BLK_STS_IOERR; bio_endio(req->private_bio); } @@ -1189,9 +1188,11 @@ drbd_submit_req_private_bio(struct drbd_request *req) if (get_ldev(device)) { if (drbd_insert_fault(device, type)) bio_io_error(bio); - else if (bio_op(bio) == REQ_OP_WRITE_ZEROES || - bio_op(bio) == REQ_OP_DISCARD) - drbd_process_discard_req(req); + else if (bio_op(bio) == REQ_OP_WRITE_ZEROES) + drbd_process_discard_or_zeroes_req(req, EE_ZEROOUT | + ((bio->bi_opf & REQ_NOUNMAP) ? 0 : EE_TRIM)); + else if (bio_op(bio) == REQ_OP_DISCARD) + drbd_process_discard_or_zeroes_req(req, EE_TRIM); else generic_make_request(bio); put_ldev(device); diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 94c654020f0f..c2f569d2661b 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -208,6 +208,7 @@ enum drbd_req_state_bits { __RQ_WRITE, __RQ_WSAME, __RQ_UNMAP, + __RQ_ZEROES, /* Should call drbd_al_complete_io() for this request... */ __RQ_IN_ACT_LOG, @@ -253,6 +254,7 @@ enum drbd_req_state_bits { #define RQ_WRITE (1UL << __RQ_WRITE) #define RQ_WSAME (1UL << __RQ_WSAME) #define RQ_UNMAP (1UL << __RQ_UNMAP) +#define RQ_ZEROES (1UL << __RQ_ZEROES) #define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) #define RQ_UNPLUG (1UL << __RQ_UNPLUG) #define RQ_POSTPONED (1UL << __RQ_POSTPONED) diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 0813c654c893..2b4c0db5d867 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -688,11 +688,9 @@ request_detach(struct drbd_device *device) CS_VERBOSE | CS_ORDERED | CS_INHIBIT_MD_IO); } -enum drbd_state_rv -drbd_request_detach_interruptible(struct drbd_device *device) +int drbd_request_detach_interruptible(struct drbd_device *device) { - enum drbd_state_rv rv; - int ret; + int ret, rv; drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ wait_event_interruptible(device->state_wait, @@ -1124,7 +1122,7 @@ static union drbd_state sanitize_state(struct drbd_device *device, union drbd_st ns.pdsk = D_UP_TO_DATE; } - /* Implications of the connection stat on the disk states */ + /* Implications of the connection state on the disk states */ disk_min = D_DISKLESS; disk_max = D_UP_TO_DATE; pdsk_min = D_INCONSISTENT; @@ -2109,9 +2107,8 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) spin_unlock_irq(&connection->resource->req_lock); } } - kref_put(&connection->kref, drbd_destroy_connection); - conn_md_sync(connection); + kref_put(&connection->kref, drbd_destroy_connection); return 0; } diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h index ea58301d0895..f87371e55e68 100644 --- a/drivers/block/drbd/drbd_state.h +++ b/drivers/block/drbd/drbd_state.h @@ -131,7 +131,7 @@ extern enum drbd_state_rv _drbd_set_state(struct drbd_device *, union drbd_state enum chg_state_flags, struct completion *done); extern void print_st_err(struct drbd_device *, union drbd_state, - union drbd_state, int); + union drbd_state, enum drbd_state_rv); enum drbd_state_rv _conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, @@ -162,8 +162,7 @@ static inline int drbd_request_state(struct drbd_device *device, } /* for use in adm_detach() (drbd_adm_detach(), drbd_adm_down()) */ -enum drbd_state_rv -drbd_request_detach_interruptible(struct drbd_device *device); +int drbd_request_detach_interruptible(struct drbd_device *device); enum drbd_role conn_highest_role(struct drbd_connection *connection); enum drbd_role conn_highest_peer(struct drbd_connection *connection); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 99255d0c9e2f..268ef0c5d4ab 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -153,7 +153,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee); /* FIXME do we want to detach for failed REQ_OP_DISCARD? - * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ + * ((peer_req->flags & (EE_WAS_ERROR|EE_TRIM)) == EE_WAS_ERROR) */ if (peer_req->flags & EE_WAS_ERROR) __drbd_chk_io_error(device, DRBD_WRITE_ERROR); |