diff options
Diffstat (limited to 'drivers/block/drbd')
-rw-r--r-- | drivers/block/drbd/drbd_actlog.c | 23 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 92 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 28 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 485 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nla.c | 1 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_proc.c | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_protocol.h | 12 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 196 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 74 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.h | 6 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_state.c | 38 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 107 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_wrappers.h | 54 |
13 files changed, 701 insertions, 417 deletions
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 90ae4ba8f9ee..05a1780ffa85 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -29,7 +29,6 @@ #include <linux/drbd_limits.h> #include <linux/dynamic_debug.h> #include "drbd_int.h" -#include "drbd_wrappers.h" enum al_transaction_types { @@ -204,7 +203,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd BUG_ON(!bdev->md_bdev); - drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", + dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", current->comm, current->pid, __func__, (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", (void*)_RET_IP_ ); @@ -276,7 +275,6 @@ bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval return _al_get(device, first, true); } -static bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i) { /* for bios crossing activity log extent boundaries, @@ -846,7 +844,7 @@ void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size, int wake_up = 0; unsigned long flags; - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); return; @@ -920,7 +918,7 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size if (size == 0) return 0; - if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { drbd_err(device, "sector: %llus, size: %d\n", (unsigned long long)sector, size); return 0; @@ -1023,8 +1021,7 @@ int drbd_rs_begin_io(struct drbd_device *device, sector_t sector) unsigned int enr = BM_SECT_TO_EXT(sector); struct bm_extent *bm_ext; int i, sig; - int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait. - 200 times -> 20 seconds. */ + bool sa; retry: sig = wait_event_interruptible(device->al_wait, @@ -1035,12 +1032,15 @@ retry: if (test_bit(BME_LOCKED, &bm_ext->flags)) return 0; + /* step aside only while we are above c-min-rate; unless disabled. */ + sa = drbd_rs_c_min_rate_throttle(device); + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { sig = wait_event_interruptible(device->al_wait, !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) || - test_bit(BME_PRIORITY, &bm_ext->flags)); + (sa && test_bit(BME_PRIORITY, &bm_ext->flags))); - if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) { + if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) { spin_lock_irq(&device->al_lock); if (lc_put(device->resync, &bm_ext->lce) == 0) { bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ @@ -1052,9 +1052,6 @@ retry: return -EINTR; if (schedule_timeout_interruptible(HZ/10)) return -EINTR; - if (sa && --sa == 0) - drbd_warn(device, "drbd_rs_begin_io() stepped aside for 20sec." - "Resync stalled?\n"); goto retry; } } @@ -1288,7 +1285,7 @@ void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size) sector_t esector, nr_sectors; int wake_up = 0; - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); return; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e7093d4291f1..a76ceb344d64 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -382,6 +382,12 @@ enum { __EE_CALL_AL_COMPLETE_IO, __EE_MAY_SET_IN_SYNC, + /* is this a TRIM aka REQ_DISCARD? */ + __EE_IS_TRIM, + /* our lower level cannot handle trim, + * and we want to fall back to zeroout instead */ + __EE_IS_TRIM_USE_ZEROOUT, + /* In case a barrier failed, * we need to resubmit without the barrier flag. */ __EE_RESUBMITTED, @@ -405,7 +411,9 @@ enum { }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) -#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) +#define EE_IS_TRIM (1<<__EE_IS_TRIM) +#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT) +#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) #define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) #define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) @@ -579,6 +587,7 @@ struct drbd_resource { struct list_head resources; struct res_opts res_opts; struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */ + struct mutex adm_mutex; /* mutex to serialize administrative requests */ spinlock_t req_lock; unsigned susp:1; /* IO suspended by user */ @@ -609,6 +618,7 @@ struct drbd_connection { struct drbd_socket data; /* data/barrier/cstate/parameter packets */ struct drbd_socket meta; /* ping/ack (metadata) packets */ int agreed_pro_version; /* actually used protocol version */ + u32 agreed_features; unsigned long last_received; /* in jiffies, either socket */ unsigned int ko_count; @@ -814,6 +824,28 @@ struct drbd_device { struct submit_worker submit; }; +struct drbd_config_context { + /* assigned from drbd_genlmsghdr */ + unsigned int minor; + /* assigned from request attributes, if present */ + unsigned int volume; +#define VOLUME_UNSPECIFIED (-1U) + /* pointer into the request skb, + * limited lifetime! */ + char *resource_name; + struct nlattr *my_addr; + struct nlattr *peer_addr; + + /* reply buffer */ + struct sk_buff *reply_skb; + /* pointer into reply buffer */ + struct drbd_genlmsghdr *reply_dh; + /* resolved from attributes, if possible */ + struct drbd_device *device; + struct drbd_resource *resource; + struct drbd_connection *connection; +}; + static inline struct drbd_device *minor_to_device(unsigned int minor) { return (struct drbd_device *)idr_find(&drbd_devices, minor); @@ -821,7 +853,7 @@ static inline struct drbd_device *minor_to_device(unsigned int minor) static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device) { - return list_first_entry(&device->peer_devices, struct drbd_peer_device, peer_devices); + return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices); } #define for_each_resource(resource, _resources) \ @@ -1139,6 +1171,12 @@ struct bm_extent { #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ #define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ +/* For now, don't allow more than one activity log extent worth of data + * to be discarded in one go. We may need to rework drbd_al_begin_io() + * to allow for even larger discard ranges */ +#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE +#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9) + extern int drbd_bm_init(struct drbd_device *device); extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits); extern void drbd_bm_cleanup(struct drbd_device *device); @@ -1229,9 +1267,9 @@ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); extern rwlock_t global_state_lock; extern int conn_lowest_minor(struct drbd_connection *connection); -enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned int minor, int vnr); +extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); extern void drbd_destroy_device(struct kref *kref); -extern void drbd_delete_device(struct drbd_device *mdev); +extern void drbd_delete_device(struct drbd_device *device); extern struct drbd_resource *drbd_create_resource(const char *name); extern void drbd_free_resource(struct drbd_resource *resource); @@ -1257,7 +1295,7 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); /* drbd_nl.c */ -extern int drbd_msg_put_info(const char *info); +extern int drbd_msg_put_info(struct sk_buff *skb, const char *info); extern void drbd_suspend_io(struct drbd_device *device); extern void drbd_resume_io(struct drbd_device *device); extern char *ppsize(char *buf, unsigned long long size); @@ -1283,6 +1321,10 @@ extern void conn_try_outdate_peer_async(struct drbd_connection *connection); extern int drbd_khelper(struct drbd_device *device, char *cmd); /* drbd_worker.c */ +/* bi_end_io handlers */ +extern void drbd_md_io_complete(struct bio *bio, int error); +extern void drbd_peer_request_endio(struct bio *bio, int error); +extern void drbd_request_endio(struct bio *bio, int error); extern int drbd_worker(struct drbd_thread *thi); enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor); void drbd_resync_after_changed(struct drbd_device *device); @@ -1332,16 +1374,20 @@ extern int w_start_resync(struct drbd_work *, int); extern void resync_timer_fn(unsigned long data); extern void start_resync_timer_fn(unsigned long data); +extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); + /* drbd_receiver.c */ extern int drbd_receiver(struct drbd_thread *thi); extern int drbd_asender(struct drbd_thread *thi); -extern int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector); +extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); +extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector); extern int drbd_submit_peer_request(struct drbd_device *, struct drbd_peer_request *, const unsigned, const int); extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64, sector_t, unsigned int, + bool, gfp_t) __must_hold(local); extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *, int); @@ -1401,6 +1447,37 @@ static inline void drbd_tcp_quickack(struct socket *sock) (char*)&val, sizeof(val)); } +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(struct drbd_device *device, + sector_t size) +{ + /* set_capacity(device->this_bdev->bd_disk, size); */ + set_capacity(device->vdisk, size); + device->this_bdev->bd_inode->i_size = (loff_t)size << 9; +} + +/* + * used to submit our private bio + */ +static inline void drbd_generic_make_request(struct drbd_device *device, + int fault_type, struct bio *bio) +{ + __release(local); + if (!bio->bi_bdev) { + printk(KERN_ERR "drbd%d: drbd_generic_make_request: " + "bio->bi_bdev == NULL\n", + device_to_minor(device)); + dump_stack(); + bio_endio(bio, -ENODEV); + return; + } + + if (drbd_insert_fault(device, fault_type)) + bio_endio(bio, -EIO); + else + generic_make_request(bio); +} + void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo); /* drbd_proc.c */ @@ -1410,6 +1487,7 @@ extern const char *drbd_conn_str(enum drbd_conns s); extern const char *drbd_role_str(enum drbd_role s); /* drbd_actlog.c */ +extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i); extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i); extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate); extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i); @@ -2144,7 +2222,7 @@ static inline void drbd_md_flush(struct drbd_device *device) static inline struct drbd_connection *first_connection(struct drbd_resource *resource) { - return list_first_entry(&resource->connections, + return list_first_entry_or_null(&resource->connections, struct drbd_connection, connections); } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 331e5cc1227d..960645c26e6f 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1607,8 +1607,8 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long b return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; } -/* Used to send write requests - * R_PRIMARY -> Peer (P_DATA) +/* Used to send write or TRIM aka REQ_DISCARD requests + * R_PRIMARY -> Peer (P_DATA, P_TRIM) */ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req) { @@ -1640,6 +1640,16 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * dp_flags |= DP_SEND_WRITE_ACK; } p->dp_flags = cpu_to_be32(dp_flags); + + if (dp_flags & DP_DISCARD) { + struct p_trim *t = (struct p_trim*)p; + t->size = cpu_to_be32(req->i.size); + err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0); + goto out; + } + + /* our digest is still only over the payload. + * TRIM does not carry any payload. */ if (dgs) drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1); err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size); @@ -1675,6 +1685,7 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * ... Be noisy about digest too large ... } */ } +out: mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ return err; @@ -2570,6 +2581,7 @@ struct drbd_resource *drbd_create_resource(const char *name) INIT_LIST_HEAD(&resource->connections); list_add_tail_rcu(&resource->resources, &drbd_resources); mutex_init(&resource->conf_update); + mutex_init(&resource->adm_mutex); spin_lock_init(&resource->req_lock); return resource; @@ -2687,14 +2699,16 @@ static int init_submitter(struct drbd_device *device) return 0; } -enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned int minor, int vnr) +enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor) { + struct drbd_resource *resource = adm_ctx->resource; struct drbd_connection *connection; struct drbd_device *device; struct drbd_peer_device *peer_device, *tmp_peer_device; struct gendisk *disk; struct request_queue *q; int id; + int vnr = adm_ctx->volume; enum drbd_ret_code err = ERR_NOMEM; device = minor_to_device(minor); @@ -2763,7 +2777,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (id < 0) { if (id == -ENOSPC) { err = ERR_MINOR_EXISTS; - drbd_msg_put_info("requested minor exists already"); + drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); } goto out_no_minor_idr; } @@ -2773,7 +2787,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (id < 0) { if (id == -ENOSPC) { err = ERR_MINOR_EXISTS; - drbd_msg_put_info("requested minor exists already"); + drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); } goto out_idr_remove_minor; } @@ -2794,7 +2808,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (id < 0) { if (id == -ENOSPC) { err = ERR_INVALID_REQUEST; - drbd_msg_put_info("requested volume exists already"); + drbd_msg_put_info(adm_ctx->reply_skb, "requested volume exists already"); } goto out_idr_remove_from_resource; } @@ -2803,7 +2817,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (init_submitter(device)) { err = ERR_NOMEM; - drbd_msg_put_info("unable to create submit workqueue"); + drbd_msg_put_info(adm_ctx->reply_skb, "unable to create submit workqueue"); goto out_idr_remove_vol; } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 526414bc2cab..1b35c45c92b7 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -34,7 +34,6 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" -#include "drbd_wrappers.h" #include <asm/unaligned.h> #include <linux/drbd_limits.h> #include <linux/kthread.h> @@ -82,32 +81,6 @@ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); /* used blkdev_get_by_path, to claim our meta data device(s) */ static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; -/* Configuration is strictly serialized, because generic netlink message - * processing is strictly serialized by the genl_lock(). - * Which means we can use one static global drbd_config_context struct. - */ -static struct drbd_config_context { - /* assigned from drbd_genlmsghdr */ - unsigned int minor; - /* assigned from request attributes, if present */ - unsigned int volume; -#define VOLUME_UNSPECIFIED (-1U) - /* pointer into the request skb, - * limited lifetime! */ - char *resource_name; - struct nlattr *my_addr; - struct nlattr *peer_addr; - - /* reply buffer */ - struct sk_buff *reply_skb; - /* pointer into reply buffer */ - struct drbd_genlmsghdr *reply_dh; - /* resolved from attributes, if possible */ - struct drbd_device *device; - struct drbd_resource *resource; - struct drbd_connection *connection; -} adm_ctx; - static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) { genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); @@ -117,9 +90,8 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only * reason it could fail was no space in skb, and there are 4k available. */ -int drbd_msg_put_info(const char *info) +int drbd_msg_put_info(struct sk_buff *skb, const char *info) { - struct sk_buff *skb = adm_ctx.reply_skb; struct nlattr *nla; int err = -EMSGSIZE; @@ -143,42 +115,46 @@ int drbd_msg_put_info(const char *info) * and per-family private info->pointers. * But we need to stay compatible with older kernels. * If it returns successfully, adm_ctx members are valid. + * + * At this point, we still rely on the global genl_lock(). + * If we want to avoid that, and allow "genl_family.parallel_ops", we may need + * to add additional synchronization against object destruction/modification. */ #define DRBD_ADM_NEED_MINOR 1 #define DRBD_ADM_NEED_RESOURCE 2 #define DRBD_ADM_NEED_CONNECTION 4 -static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, - unsigned flags) +static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, + struct sk_buff *skb, struct genl_info *info, unsigned flags) { struct drbd_genlmsghdr *d_in = info->userhdr; const u8 cmd = info->genlhdr->cmd; int err; - memset(&adm_ctx, 0, sizeof(adm_ctx)); + memset(adm_ctx, 0, sizeof(*adm_ctx)); /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) return -EPERM; - adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); - if (!adm_ctx.reply_skb) { + adm_ctx->reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!adm_ctx->reply_skb) { err = -ENOMEM; goto fail; } - adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb, + adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb, info, &drbd_genl_family, 0, cmd); /* put of a few bytes into a fresh skb of >= 4k will always succeed. * but anyways */ - if (!adm_ctx.reply_dh) { + if (!adm_ctx->reply_dh) { err = -ENOMEM; goto fail; } - adm_ctx.reply_dh->minor = d_in->minor; - adm_ctx.reply_dh->ret_code = NO_ERROR; + adm_ctx->reply_dh->minor = d_in->minor; + adm_ctx->reply_dh->ret_code = NO_ERROR; - adm_ctx.volume = VOLUME_UNSPECIFIED; + adm_ctx->volume = VOLUME_UNSPECIFIED; if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { struct nlattr *nla; /* parse and validate only */ @@ -188,111 +164,131 @@ static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, /* It was present, and valid, * copy it over to the reply skb. */ - err = nla_put_nohdr(adm_ctx.reply_skb, + err = nla_put_nohdr(adm_ctx->reply_skb, info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len, info->attrs[DRBD_NLA_CFG_CONTEXT]); if (err) goto fail; - /* and assign stuff to the global adm_ctx */ + /* and assign stuff to the adm_ctx */ nla = nested_attr_tb[__nla_type(T_ctx_volume)]; if (nla) - adm_ctx.volume = nla_get_u32(nla); + adm_ctx->volume = nla_get_u32(nla); nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; if (nla) - adm_ctx.resource_name = nla_data(nla); - adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; - adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; - if ((adm_ctx.my_addr && - nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.connection->my_addr)) || - (adm_ctx.peer_addr && - nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.connection->peer_addr))) { + adm_ctx->resource_name = nla_data(nla); + adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; + adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; + if ((adm_ctx->my_addr && + nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) || + (adm_ctx->peer_addr && + nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) { err = -EINVAL; goto fail; } } - adm_ctx.minor = d_in->minor; - adm_ctx.device = minor_to_device(d_in->minor); - if (adm_ctx.resource_name) { - adm_ctx.resource = drbd_find_resource(adm_ctx.resource_name); + adm_ctx->minor = d_in->minor; + adm_ctx->device = minor_to_device(d_in->minor); + + /* We are protected by the global genl_lock(). + * But we may explicitly drop it/retake it in drbd_adm_set_role(), + * so make sure this object stays around. */ + if (adm_ctx->device) + kref_get(&adm_ctx->device->kref); + + if (adm_ctx->resource_name) { + adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name); } - if (!adm_ctx.device && (flags & DRBD_ADM_NEED_MINOR)) { - drbd_msg_put_info("unknown minor"); + if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor"); return ERR_MINOR_INVALID; } - if (!adm_ctx.resource && (flags & DRBD_ADM_NEED_RESOURCE)) { - drbd_msg_put_info("unknown resource"); - if (adm_ctx.resource_name) + if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource"); + if (adm_ctx->resource_name) return ERR_RES_NOT_KNOWN; return ERR_INVALID_REQUEST; } if (flags & DRBD_ADM_NEED_CONNECTION) { - if (adm_ctx.resource) { - drbd_msg_put_info("no resource name expected"); + if (adm_ctx->resource) { + drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected"); return ERR_INVALID_REQUEST; } - if (adm_ctx.device) { - drbd_msg_put_info("no minor number expected"); + if (adm_ctx->device) { + drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected"); return ERR_INVALID_REQUEST; } - if (adm_ctx.my_addr && adm_ctx.peer_addr) - adm_ctx.connection = conn_get_by_addrs(nla_data(adm_ctx.my_addr), - nla_len(adm_ctx.my_addr), - nla_data(adm_ctx.peer_addr), - nla_len(adm_ctx.peer_addr)); - if (!adm_ctx.connection) { - drbd_msg_put_info("unknown connection"); + if (adm_ctx->my_addr && adm_ctx->peer_addr) + adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr), + nla_len(adm_ctx->my_addr), + nla_data(adm_ctx->peer_addr), + nla_len(adm_ctx->peer_addr)); + if (!adm_ctx->connection) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection"); return ERR_INVALID_REQUEST; } } /* some more paranoia, if the request was over-determined */ - if (adm_ctx.device && adm_ctx.resource && - adm_ctx.device->resource != adm_ctx.resource) { + if (adm_ctx->device && adm_ctx->resource && + adm_ctx->device->resource != adm_ctx->resource) { pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n", - adm_ctx.minor, adm_ctx.resource->name, - adm_ctx.device->resource->name); - drbd_msg_put_info("minor exists in different resource"); + adm_ctx->minor, adm_ctx->resource->name, + adm_ctx->device->resource->name); + drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource"); return ERR_INVALID_REQUEST; } - if (adm_ctx.device && - adm_ctx.volume != VOLUME_UNSPECIFIED && - adm_ctx.volume != adm_ctx.device->vnr) { + if (adm_ctx->device && + adm_ctx->volume != VOLUME_UNSPECIFIED && + adm_ctx->volume != adm_ctx->device->vnr) { pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", - adm_ctx.minor, adm_ctx.volume, - adm_ctx.device->vnr, - adm_ctx.device->resource->name); - drbd_msg_put_info("minor exists as different volume"); + adm_ctx->minor, adm_ctx->volume, + adm_ctx->device->vnr, + adm_ctx->device->resource->name); + drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume"); return ERR_INVALID_REQUEST; } + /* still, provide adm_ctx->resource always, if possible. */ + if (!adm_ctx->resource) { + adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource + : adm_ctx->connection ? adm_ctx->connection->resource : NULL; + if (adm_ctx->resource) + kref_get(&adm_ctx->resource->kref); + } + return NO_ERROR; fail: - nlmsg_free(adm_ctx.reply_skb); - adm_ctx.reply_skb = NULL; + nlmsg_free(adm_ctx->reply_skb); + adm_ctx->reply_skb = NULL; return err; } -static int drbd_adm_finish(struct genl_info *info, int retcode) +static int drbd_adm_finish(struct drbd_config_context *adm_ctx, + struct genl_info *info, int retcode) { - if (adm_ctx.connection) { - kref_put(&adm_ctx.connection->kref, drbd_destroy_connection); - adm_ctx.connection = NULL; + if (adm_ctx->device) { + kref_put(&adm_ctx->device->kref, drbd_destroy_device); + adm_ctx->device = NULL; } - if (adm_ctx.resource) { - kref_put(&adm_ctx.resource->kref, drbd_destroy_resource); - adm_ctx.resource = NULL; + if (adm_ctx->connection) { + kref_put(&adm_ctx->connection->kref, &drbd_destroy_connection); + adm_ctx->connection = NULL; + } + if (adm_ctx->resource) { + kref_put(&adm_ctx->resource->kref, drbd_destroy_resource); + adm_ctx->resource = NULL; } - if (!adm_ctx.reply_skb) + if (!adm_ctx->reply_skb) return -ENOMEM; - adm_ctx.reply_dh->ret_code = retcode; - drbd_adm_send_reply(adm_ctx.reply_skb, info); + adm_ctx->reply_dh->ret_code = retcode; + drbd_adm_send_reply(adm_ctx->reply_skb, info); return 0; } @@ -426,6 +422,14 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connec } rcu_read_unlock(); + if (fp == FP_NOT_AVAIL) { + /* IO Suspending works on the whole resource. + Do it only for one device. */ + vnr = 0; + peer_device = idr_get_next(&connection->peer_devices, &vnr); + drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0)); + } + return fp; } @@ -438,12 +442,13 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) char *ex_to_string; int r; + spin_lock_irq(&connection->resource->req_lock); if (connection->cstate >= C_WF_REPORT_PARAMS) { drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n"); + spin_unlock_irq(&connection->resource->req_lock); return false; } - spin_lock_irq(&connection->resource->req_lock); connect_cnt = connection->connect_cnt; spin_unlock_irq(&connection->resource->req_lock); @@ -654,11 +659,11 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force) put_ldev(device); } } else { - mutex_lock(&device->resource->conf_update); + /* Called from drbd_adm_set_role only. + * We are still holding the conf_update mutex. */ nc = first_peer_device(device)->connection->net_conf; if (nc) nc->discard_my_data = 0; /* without copy; single bit op is atomic */ - mutex_unlock(&device->resource->conf_update); set_disk_ro(device->vdisk, false); if (get_ldev(device)) { @@ -700,11 +705,12 @@ static const char *from_attrs_err_to_txt(int err) int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct set_role_parms parms; int err; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -715,17 +721,22 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) err = set_role_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } } + genl_unlock(); + mutex_lock(&adm_ctx.resource->adm_mutex); if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate); else retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); + + mutex_unlock(&adm_ctx.resource->adm_mutex); + genl_lock(); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -1104,15 +1115,18 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_ struct request_queue * const q = device->rq_queue; unsigned int max_hw_sectors = max_bio_size >> 9; unsigned int max_segments = 0; + struct request_queue *b = NULL; if (get_ldev_if_state(device, D_ATTACHING)) { - struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue; + b = device->ldev->backing_bdev->bd_disk->queue; max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); rcu_read_lock(); max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; rcu_read_unlock(); - put_ldev(device); + + blk_set_stacking_limits(&q->limits); + blk_queue_max_write_same_sectors(q, 0); } blk_queue_logical_block_size(q, 512); @@ -1121,8 +1135,25 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_ blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1); - if (get_ldev_if_state(device, D_ATTACHING)) { - struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue; + if (b) { + struct drbd_connection *connection = first_peer_device(device)->connection; + + if (blk_queue_discard(b) && + (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { + /* For now, don't allow more than one activity log extent worth of data + * to be discarded in one go. We may need to rework drbd_al_begin_io() + * to allow for even larger discard ranges */ + q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS; + + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + /* REALLY? Is stacking secdiscard "legal"? */ + if (blk_queue_secdiscard(b)) + queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); + } else { + q->limits.max_discard_sectors = 0; + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); + } blk_queue_stack_limits(q, b); @@ -1164,8 +1195,14 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device) peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ else peer = DRBD_MAX_BIO_SIZE; - } + /* We may later detach and re-attach on a disconnected Primary. + * Avoid this setting to jump back in that case. + * We want to store what we know the peer DRBD can handle, + * not what the peer IO backend can handle. */ + if (peer > device->peer_max_bio_size) + device->peer_max_bio_size = peer; + } new = min(local, peer); if (device->state.role == R_PRIMARY && new < now) @@ -1258,19 +1295,21 @@ static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev) int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct drbd_device *device; struct disk_conf *new_disk_conf, *old_disk_conf; struct fifo_buffer *old_plan = NULL, *new_plan = NULL; int err, fifo_size; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; device = adm_ctx.device; + mutex_lock(&adm_ctx.resource->adm_mutex); /* we also need a disk * to change the options on */ @@ -1294,7 +1333,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) err = disk_conf_from_attrs_for_change(new_disk_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail_unlock; } @@ -1385,12 +1424,15 @@ fail_unlock: success: put_ldev(device); out: - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; int err; enum drbd_ret_code retcode; @@ -1406,13 +1448,14 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) enum drbd_state_rv rv; struct net_conf *nc; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto finish; device = adm_ctx.device; + mutex_lock(&adm_ctx.resource->adm_mutex); conn_reconfig_start(first_peer_device(device)->connection); /* if you want to reconfigure, please tear down first */ @@ -1455,7 +1498,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) err = disk_conf_from_attrs(new_disk_conf, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -1619,7 +1662,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } if (device->state.conn < C_CONNECTED && - device->state.role == R_PRIMARY && + device->state.role == R_PRIMARY && device->ed_uuid && (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { drbd_err(device, "Can only attach to data with current UUID=%016llX\n", (unsigned long long)device->ed_uuid); @@ -1797,7 +1840,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); put_ldev(device); conn_reconfig_done(first_peer_device(device)->connection); - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; force_diskless_dec: @@ -1819,9 +1863,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) kfree(new_disk_conf); lc_destroy(resync_lru); kfree(new_plan); - + mutex_unlock(&adm_ctx.resource->adm_mutex); finish: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -1860,11 +1904,12 @@ out: * Only then we have finally detached. */ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct detach_parms parms = { }; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -1874,14 +1919,16 @@ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) err = detach_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } } + mutex_lock(&adm_ctx.resource->adm_mutex); retcode = adm_detach(adm_ctx.device, parms.force_detach); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2055,6 +2102,7 @@ static void free_crypto(struct crypto *crypto) int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct drbd_connection *connection; struct net_conf *old_net_conf, *new_net_conf = NULL; @@ -2063,13 +2111,14 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) int rsr; /* re-sync running */ struct crypto crypto = { }; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; connection = adm_ctx.connection; + mutex_lock(&adm_ctx.resource->adm_mutex); new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); if (!new_net_conf) { @@ -2084,7 +2133,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) old_net_conf = connection->net_conf; if (!old_net_conf) { - drbd_msg_put_info("net conf missing, try connect"); + drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect"); retcode = ERR_INVALID_REQUEST; goto fail; } @@ -2096,7 +2145,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) err = net_conf_from_attrs_for_change(new_net_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -2167,12 +2216,15 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) done: conn_reconfig_done(connection); out: - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_peer_device *peer_device; struct net_conf *old_net_conf, *new_net_conf = NULL; struct crypto crypto = { }; @@ -2182,14 +2234,14 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) int i; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { - drbd_msg_put_info("connection endpoint(s) missing"); + drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing"); retcode = ERR_INVALID_REQUEST; goto out; } @@ -2215,6 +2267,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) } } + mutex_lock(&adm_ctx.resource->adm_mutex); connection = first_connection(adm_ctx.resource); conn_reconfig_start(connection); @@ -2235,7 +2288,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) err = net_conf_from_attrs(new_net_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -2284,7 +2337,8 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); conn_reconfig_done(connection); - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; fail: @@ -2292,8 +2346,9 @@ fail: kfree(new_net_conf); conn_reconfig_done(connection); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2356,13 +2411,14 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct disconnect_parms parms; struct drbd_connection *connection; enum drbd_state_rv rv; enum drbd_ret_code retcode; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -2374,18 +2430,20 @@ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) err = disconnect_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } } + mutex_lock(&adm_ctx.resource->adm_mutex); rv = conn_try_disconnect(connection, parms.force_disconnect); if (rv < SS_SUCCESS) retcode = rv; /* FIXME: Type mismatch. */ else retcode = NO_ERROR; + mutex_unlock(&adm_ctx.resource->adm_mutex); fail: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2407,6 +2465,7 @@ void resync_after_online_grow(struct drbd_device *device) int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct disk_conf *old_disk_conf, *new_disk_conf = NULL; struct resize_parms rs; struct drbd_device *device; @@ -2417,12 +2476,13 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) sector_t u_size; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto fail; + goto finish; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; if (!get_ldev(device)) { retcode = ERR_NO_DISK; @@ -2436,7 +2496,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) err = resize_parms_from_attrs(&rs, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail_ldev; } } @@ -2482,7 +2542,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) goto fail_ldev; } - if (device->state.conn != C_CONNECTED) { + if (device->state.conn != C_CONNECTED && !rs.resize_force) { retcode = ERR_MD_LAYOUT_CONNECTED; goto fail_ldev; } @@ -2528,7 +2588,9 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) } fail: - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; fail_ldev: @@ -2538,11 +2600,12 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct res_opts res_opts; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -2555,33 +2618,37 @@ int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) err = res_opts_from_attrs(&res_opts, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } + mutex_lock(&adm_ctx.resource->adm_mutex); err = set_resource_options(adm_ctx.resource, &res_opts); if (err) { retcode = ERR_INVALID_REQUEST; if (err == -ENOMEM) retcode = ERR_NOMEM; } + mutex_unlock(&adm_ctx.resource->adm_mutex); fail: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; /* If there is still bitmap IO pending, probably because of a previous @@ -2605,26 +2672,29 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) } else retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T)); drbd_resume_io(device); - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, union drbd_state mask, union drbd_state val) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); retcode = drbd_request_state(adm_ctx.device, mask, val); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2639,15 +2709,17 @@ static int drbd_bmio_set_susp_al(struct drbd_device *device) int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; int retcode; /* drbd_ret_code, drbd_state_rv */ struct drbd_device *device; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; /* If there is still bitmap IO pending, probably because of a previous @@ -2674,40 +2746,45 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) } else retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S)); drbd_resume_io(device); - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO) retcode = ERR_PAUSE_IS_SET; + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; union drbd_dev_state s; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { s = adm_ctx.device->state; if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { @@ -2717,9 +2794,9 @@ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) retcode = ERR_PAUSE_IS_CLEAR; } } - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2730,15 +2807,17 @@ int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; if (test_bit(NEW_CUR_UUID, &device->flags)) { drbd_uuid_new_current(device); @@ -2753,9 +2832,9 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO); } drbd_resume_io(device); - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2931,10 +3010,11 @@ nla_put_failure: int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -2946,7 +3026,7 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) return err; } out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -3133,11 +3213,12 @@ dump: int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct timeout_parms tp; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3154,17 +3235,18 @@ int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) return err; } out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; enum drbd_ret_code retcode; struct start_ov_parms parms; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3179,10 +3261,12 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) int err = start_ov_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } } + mutex_lock(&adm_ctx.resource->adm_mutex); + /* w_make_ov_request expects position to be aligned */ device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); device->ov_stop_sector = parms.ov_stop_sector; @@ -3193,21 +3277,24 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); retcode = drbd_request_state(device, NS(conn, C_VERIFY_S)); drbd_resume_io(device); + + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; enum drbd_ret_code retcode; int skip_initial_sync = 0; int err; struct new_c_uuid_parms args; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3219,11 +3306,12 @@ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) err = new_c_uuid_parms_from_attrs(&args, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out_nolock; } } + mutex_lock(&adm_ctx.resource->adm_mutex); mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */ if (!get_ldev(device)) { @@ -3268,22 +3356,24 @@ out_dec: put_ldev(device); out: mutex_unlock(device->state_mutex); + mutex_unlock(&adm_ctx.resource->adm_mutex); out_nolock: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } static enum drbd_ret_code -drbd_check_resource_name(const char *name) +drbd_check_resource_name(struct drbd_config_context *adm_ctx) { + const char *name = adm_ctx->resource_name; if (!name || !name[0]) { - drbd_msg_put_info("resource name missing"); + drbd_msg_put_info(adm_ctx->reply_skb, "resource name missing"); return ERR_MANDATORY_TAG; } /* if we want to use these in sysfs/configfs/debugfs some day, * we must not allow slashes */ if (strchr(name, '/')) { - drbd_msg_put_info("invalid resource name"); + drbd_msg_put_info(adm_ctx->reply_skb, "invalid resource name"); return ERR_INVALID_REQUEST; } return NO_ERROR; @@ -3291,11 +3381,12 @@ drbd_check_resource_name(const char *name) int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct res_opts res_opts; int err; - retcode = drbd_adm_prepare(skb, info, 0); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3305,48 +3396,50 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) err = res_opts_from_attrs(&res_opts, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } - retcode = drbd_check_resource_name(adm_ctx.resource_name); + retcode = drbd_check_resource_name(&adm_ctx); if (retcode != NO_ERROR) goto out; if (adm_ctx.resource) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { retcode = ERR_INVALID_REQUEST; - drbd_msg_put_info("resource exists"); + drbd_msg_put_info(adm_ctx.reply_skb, "resource exists"); } /* else: still NO_ERROR */ goto out; } + /* not yet safe for genl_family.parallel_ops */ if (!conn_create(adm_ctx.resource_name, &res_opts)) retcode = ERR_NOMEM; out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_genlmsghdr *dh = info->userhdr; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; if (dh->minor > MINORMASK) { - drbd_msg_put_info("requested minor out of range"); + drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range"); retcode = ERR_INVALID_REQUEST; goto out; } if (adm_ctx.volume > DRBD_VOLUME_MAX) { - drbd_msg_put_info("requested volume id out of range"); + drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range"); retcode = ERR_INVALID_REQUEST; goto out; } @@ -3360,9 +3453,11 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) goto out; } - retcode = drbd_create_device(adm_ctx.resource, dh->minor, adm_ctx.volume); + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = drbd_create_device(&adm_ctx, dh->minor); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -3383,35 +3478,40 @@ static enum drbd_ret_code adm_del_minor(struct drbd_device *device) int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); retcode = adm_del_minor(adm_ctx.device); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_resource *resource; struct drbd_connection *connection; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ unsigned i; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; resource = adm_ctx.resource; + mutex_lock(&resource->adm_mutex); /* demote */ for_each_connection(connection, resource) { struct drbd_peer_device *peer_device; @@ -3419,14 +3519,14 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) idr_for_each_entry(&connection->peer_devices, peer_device, i) { retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0); if (retcode < SS_SUCCESS) { - drbd_msg_put_info("failed to demote"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote"); goto out; } } retcode = conn_try_disconnect(connection, 0); if (retcode < SS_SUCCESS) { - drbd_msg_put_info("failed to disconnect"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect"); goto out; } } @@ -3435,7 +3535,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) idr_for_each_entry(&resource->devices, device, i) { retcode = adm_detach(device, 0); if (retcode < SS_SUCCESS || retcode > NO_ERROR) { - drbd_msg_put_info("failed to detach"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach"); goto out; } } @@ -3453,7 +3553,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) retcode = adm_del_minor(device); if (retcode != NO_ERROR) { /* "can not happen" */ - drbd_msg_put_info("failed to delete volume"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume"); goto out; } } @@ -3462,25 +3562,28 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) synchronize_rcu(); drbd_free_resource(resource); retcode = NO_ERROR; - out: - drbd_adm_finish(info, retcode); + mutex_unlock(&resource->adm_mutex); +finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_resource *resource; struct drbd_connection *connection; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; resource = adm_ctx.resource; + mutex_lock(&resource->adm_mutex); for_each_connection(connection, resource) { if (connection->cstate > C_STANDALONE) { retcode = ERR_NET_CONFIGURED; @@ -3499,7 +3602,9 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) drbd_free_resource(resource); retcode = NO_ERROR; out: - drbd_adm_finish(info, retcode); + mutex_unlock(&resource->adm_mutex); +finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c index fa672b6df8d6..b2d4791498a6 100644 --- a/drivers/block/drbd/drbd_nla.c +++ b/drivers/block/drbd/drbd_nla.c @@ -1,4 +1,3 @@ -#include "drbd_wrappers.h" #include <linux/kernel.h> #include <net/netlink.h> #include <linux/drbd_genl_api.h> diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 2f26e8ffa45b..89736bdbbc70 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -116,7 +116,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se /* ------------------------ ~18s average ------------------------ */ i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS; dt = (jiffies - device->rs_mark_time[i]) / HZ; - if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS)) + if (dt > 180) stalled = 1; if (!dt) diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index 3c04ec0ea333..2da9104a3851 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -54,6 +54,11 @@ enum drbd_packet { P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */ P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */ P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */ + /* 0x2e to 0x30 reserved, used in drbd 9 */ + + /* REQ_DISCARD. We used "discard" in different contexts before, + * which is why I chose TRIM here, to disambiguate. */ + P_TRIM = 0x31, P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ P_MAX_OPT_CMD = 0x101, @@ -119,6 +124,11 @@ struct p_data { u32 dp_flags; } __packed; +struct p_trim { + struct p_data p_data; + u32 size; /* == bio->bi_size */ +} __packed; + /* * commands which share a struct: * p_block_ack: @@ -150,6 +160,8 @@ struct p_block_req { * ReportParams */ +#define FF_TRIM 1 + struct p_connection_features { u32 protocol_min; u32 feature_flags; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 68e3992e8838..b6c8aaf4931b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -46,9 +46,10 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" - #include "drbd_vli.h" +#define PRO_FEATURES (FF_TRIM) + struct packet_info { enum drbd_packet cmd; unsigned int size; @@ -65,7 +66,7 @@ enum finish_epoch { static int drbd_do_features(struct drbd_connection *connection); static int drbd_do_auth(struct drbd_connection *connection); static int drbd_disconnected(struct drbd_peer_device *); - +static void conn_wait_active_ee_empty(struct drbd_connection *connection); static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event); static int e_end_block(struct drbd_work *, int); @@ -234,9 +235,17 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) * @retry: whether to retry, if not enough pages are available right now * * Tries to allocate number pages, first from our own page pool, then from - * the kernel, unless this allocation would exceed the max_buffers setting. + * the kernel. * Possibly retry until DRBD frees sufficient pages somewhere else. * + * If this allocation would exceed the max_buffers setting, we throttle + * allocation (schedule_timeout) to give the system some room to breathe. + * + * We do not use max-buffers as hard limit, because it could lead to + * congestion and further to a distributed deadlock during online-verify or + * (checksum based) resync, if the max-buffers, socket buffer sizes and + * resync-rate settings are mis-configured. + * * Returns a page chain linked via page->private. */ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number, @@ -246,10 +255,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int struct page *page = NULL; struct net_conf *nc; DEFINE_WAIT(wait); - int mxb; + unsigned int mxb; - /* Yes, we may run up to @number over max_buffers. If we - * follow it strictly, the admin will get it wrong anyways. */ rcu_read_lock(); nc = rcu_dereference(peer_device->connection->net_conf); mxb = nc ? nc->max_buffers : 1000000; @@ -277,7 +284,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int break; } - schedule(); + if (schedule_timeout(HZ/10) == 0) + mxb = UINT_MAX; } finish_wait(&drbd_pp_wait, &wait); @@ -331,7 +339,7 @@ You must not have the req_lock: struct drbd_peer_request * drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector, - unsigned int data_size, gfp_t gfp_mask) __must_hold(local) + unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local) { struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; @@ -348,7 +356,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto return NULL; } - if (data_size) { + if (has_payload && data_size) { page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT)); if (!page) goto fail; @@ -1026,24 +1034,27 @@ randomize: if (drbd_send_protocol(connection) == -EOPNOTSUPP) return -1; + /* Prevent a race between resync-handshake and + * being promoted to Primary. + * + * Grab and release the state mutex, so we know that any current + * drbd_set_role() is finished, and any incoming drbd_set_role + * will see the STATE_SENT flag, and wait for it to be cleared. + */ + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + mutex_lock(peer_device->device->state_mutex); + set_bit(STATE_SENT, &connection->flags); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + mutex_unlock(peer_device->device->state_mutex); + rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; kref_get(&device->kref); rcu_read_unlock(); - /* Prevent a race between resync-handshake and - * being promoted to Primary. - * - * Grab and release the state mutex, so we know that any current - * drbd_set_role() is finished, and any incoming drbd_set_role - * will see the STATE_SENT flag, and wait for it to be cleared. - */ - mutex_lock(device->state_mutex); - mutex_unlock(device->state_mutex); - if (discard_my_data) set_bit(DISCARD_MY_DATA, &device->flags); else @@ -1315,6 +1326,20 @@ int drbd_submit_peer_request(struct drbd_device *device, unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; int err = -ENOMEM; + if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { + /* wait for all pending IO completions, before we start + * zeroing things out. */ + conn_wait_active_ee_empty(first_peer_device(device)->connection); + if (blkdev_issue_zeroout(device->ldev->backing_bdev, + sector, ds >> 9, GFP_NOIO)) + peer_req->flags |= EE_WAS_ERROR; + drbd_endio_write_sec_final(peer_req); + return 0; + } + + if (peer_req->flags & EE_IS_TRIM) + nr_pages = 0; /* discards don't have any payload. */ + /* In most cases, we will only need one bio. But in case the lower * level restrictions happen to be different at this offset on this * side than those of the sending peer, we may need to submit the @@ -1326,7 +1351,7 @@ int drbd_submit_peer_request(struct drbd_device *device, next_bio: bio = bio_alloc(GFP_NOIO, nr_pages); if (!bio) { - drbd_err(device, "submit_ee: Allocation of a bio failed\n"); + drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages); goto fail; } /* > peer_req->i.sector, unless this is the first bio */ @@ -1340,6 +1365,11 @@ next_bio: bios = bio; ++n_bios; + if (rw & REQ_DISCARD) { + bio->bi_iter.bi_size = ds; + goto submit; + } + page_chain_for_each(page) { unsigned len = min_t(unsigned, ds, PAGE_SIZE); if (!bio_add_page(bio, page, len, 0)) { @@ -1360,8 +1390,9 @@ next_bio: sector += len >> 9; --nr_pages; } - D_ASSERT(device, page == NULL); D_ASSERT(device, ds == 0); +submit: + D_ASSERT(device, page == NULL); atomic_set(&peer_req->pending_bios, n_bios); do { @@ -1490,19 +1521,21 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf * and from receive_Data */ static struct drbd_peer_request * read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, - int data_size) __must_hold(local) + struct packet_info *pi) __must_hold(local) { struct drbd_device *device = peer_device->device; const sector_t capacity = drbd_get_capacity(device->this_bdev); struct drbd_peer_request *peer_req; struct page *page; int dgs, ds, err; + int data_size = pi->size; void *dig_in = peer_device->connection->int_dig_in; void *dig_vv = peer_device->connection->int_dig_vv; unsigned long *data; + struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; dgs = 0; - if (peer_device->connection->peer_integrity_tfm) { + if (!trim && peer_device->connection->peer_integrity_tfm) { dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); /* * FIXME: Receive the incoming digest into the receive buffer @@ -1514,9 +1547,15 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, data_size -= dgs; } + if (trim) { + D_ASSERT(peer_device, data_size == 0); + data_size = be32_to_cpu(trim->size); + } + if (!expect(IS_ALIGNED(data_size, 512))) return NULL; - if (!expect(data_size <= DRBD_MAX_BIO_SIZE)) + /* prepare for larger trim requests. */ + if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE)) return NULL; /* even though we trust out peer, @@ -1532,11 +1571,11 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ - peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, GFP_NOIO); + peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO); if (!peer_req) return NULL; - if (!data_size) + if (trim) return peer_req; ds = data_size; @@ -1676,12 +1715,12 @@ static int e_end_resync_block(struct drbd_work *w, int unused) } static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector, - int data_size) __releases(local) + struct packet_info *pi) __releases(local) { struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; - peer_req = read_in_block(peer_device, ID_SYNCER, sector, data_size); + peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi); if (!peer_req) goto fail; @@ -1697,7 +1736,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto list_add(&peer_req->w.list, &device->sync_ee); spin_unlock_irq(&device->resource->req_lock); - atomic_add(data_size >> 9, &device->rs_sect_ev); + atomic_add(pi->size >> 9, &device->rs_sect_ev); if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0) return 0; @@ -1785,7 +1824,7 @@ static int receive_RSDataReply(struct drbd_connection *connection, struct packet /* data is submitted to disk within recv_resync_read. * corresponding put_ldev done below on error, * or in drbd_peer_request_endio. */ - err = recv_resync_read(peer_device, sector, pi->size); + err = recv_resync_read(peer_device, sector, pi); } else { if (__ratelimit(&drbd_ratelimit_state)) drbd_err(device, "Can not write resync data to local disk.\n"); @@ -2196,7 +2235,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * */ sector = be64_to_cpu(p->sector); - peer_req = read_in_block(peer_device, p->block_id, sector, pi->size); + peer_req = read_in_block(peer_device, p->block_id, sector, pi); if (!peer_req) { put_ldev(device); return -EIO; @@ -2206,7 +2245,15 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * dp_flags = be32_to_cpu(p->dp_flags); rw |= wire_flags_to_bio(dp_flags); - if (peer_req->pages == NULL) { + if (pi->cmd == P_TRIM) { + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); + peer_req->flags |= EE_IS_TRIM; + if (!blk_queue_discard(q)) + peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT; + D_ASSERT(peer_device, peer_req->i.size > 0); + D_ASSERT(peer_device, rw & REQ_DISCARD); + D_ASSERT(peer_device, peer_req->pages == NULL); + } else if (peer_req->pages == NULL) { D_ASSERT(device, peer_req->i.size == 0); D_ASSERT(device, dp_flags & DP_FLUSH); } @@ -2242,7 +2289,12 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * update_peer_seq(peer_device, peer_seq); spin_lock_irq(&device->resource->req_lock); } - list_add(&peer_req->w.list, &device->active_ee); + /* if we use the zeroout fallback code, we process synchronously + * and we wait for all pending requests, respectively wait for + * active_ee to become empty in drbd_submit_peer_request(); + * better not add ourselves here. */ + if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0) + list_add(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); if (device->state.conn == C_SYNC_TARGET) @@ -2313,39 +2365,45 @@ out_interrupted: * The current sync rate used here uses only the most recent two step marks, * to have a short time average so we can react faster. */ -int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) +bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) { - struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; - unsigned long db, dt, dbdt; struct lc_element *tmp; - int curr_events; - int throttle = 0; - unsigned int c_min_rate; - - rcu_read_lock(); - c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate; - rcu_read_unlock(); + bool throttle = true; - /* feature disabled? */ - if (c_min_rate == 0) - return 0; + if (!drbd_rs_c_min_rate_throttle(device)) + return false; spin_lock_irq(&device->al_lock); tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector)); if (tmp) { struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); - if (test_bit(BME_PRIORITY, &bm_ext->flags)) { - spin_unlock_irq(&device->al_lock); - return 0; - } + if (test_bit(BME_PRIORITY, &bm_ext->flags)) + throttle = false; /* Do not slow down if app IO is already waiting for this extent */ } spin_unlock_irq(&device->al_lock); + return throttle; +} + +bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) +{ + struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; + unsigned long db, dt, dbdt; + unsigned int c_min_rate; + int curr_events; + + rcu_read_lock(); + c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate; + rcu_read_unlock(); + + /* feature disabled? */ + if (c_min_rate == 0) + return false; + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + (int)part_stat_read(&disk->part0, sectors[1]) - atomic_read(&device->rs_sect_ev); - if (!device->rs_last_events || curr_events - device->rs_last_events > 64) { unsigned long rs_left; int i; @@ -2368,12 +2426,11 @@ int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) dbdt = Bit2KB(db/dt); if (dbdt > c_min_rate) - throttle = 1; + return true; } - return throttle; + return false; } - static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi) { struct drbd_peer_device *peer_device; @@ -2436,7 +2493,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ - peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, GFP_NOIO); + peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, + true /* has real payload */, GFP_NOIO); if (!peer_req) { put_ldev(device); return -ENOMEM; @@ -3648,6 +3706,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info put_ldev(device); } + device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); + drbd_reconsider_max_bio_size(device); + /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size(). + In case we cleared the QUEUE_FLAG_DISCARD from our queue in + drbd_reconsider_max_bio_size(), we can be sure that after + drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */ + ddsf = be16_to_cpu(p->dds_flags); if (get_ldev(device)) { dd = drbd_determine_dev_size(device, ddsf, NULL); @@ -3660,9 +3725,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info drbd_set_my_capacity(device, p_size); } - device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); - drbd_reconsider_max_bio_size(device); - if (get_ldev(device)) { if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) { device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev); @@ -4423,6 +4485,7 @@ static struct data_cmd drbd_cmd_handler[] = { [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, + [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, }; static void drbdd(struct drbd_connection *connection) @@ -4630,6 +4693,7 @@ static int drbd_send_features(struct drbd_connection *connection) memset(p, 0, sizeof(*p)); p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); + p->feature_flags = cpu_to_be32(PRO_FEATURES); return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); } @@ -4683,10 +4747,14 @@ static int drbd_do_features(struct drbd_connection *connection) goto incompat; connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); + connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags); drbd_info(connection, "Handshake successful: " "Agreed network protocol version %d\n", connection->agreed_pro_version); + drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n", + connection->agreed_features & FF_TRIM ? " " : " not "); + return 1; incompat: @@ -4778,6 +4846,12 @@ static int drbd_do_auth(struct drbd_connection *connection) goto fail; } + if (pi.size < CHALLENGE_LEN) { + drbd_err(connection, "AuthChallenge payload too small.\n"); + rv = -1; + goto fail; + } + peers_ch = kmalloc(pi.size, GFP_NOIO); if (peers_ch == NULL) { drbd_err(connection, "kmalloc of peers_ch failed\n"); @@ -4791,6 +4865,12 @@ static int drbd_do_auth(struct drbd_connection *connection) goto fail; } + if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) { + drbd_err(connection, "Peer presented the same challenge!\n"); + rv = -1; + goto fail; + } + resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm); response = kmalloc(resp_size, GFP_NOIO); if (response == NULL) { diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3779c8d2875b..09803d0d5207 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -522,6 +522,13 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); break; + case DISCARD_COMPLETED_NOTSUPP: + case DISCARD_COMPLETED_WITH_ERROR: + /* I'd rather not detach from local disk just because it + * failed a REQ_DISCARD. */ + mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + break; + case QUEUE_FOR_NET_READ: /* READ or READA, and * no local disk, @@ -1235,6 +1242,7 @@ void do_submit(struct work_struct *ws) if (list_empty(&incoming)) break; +skip_fast_path: wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending)); /* Maybe more was queued, while we prepared the transaction? * Try to stuff them into this transaction as well. @@ -1273,6 +1281,25 @@ void do_submit(struct work_struct *ws) list_del_init(&req->tl_requests); drbd_send_and_submit(device, req); } + + /* If all currently hot activity log extents are kept busy by + * incoming requests, we still must not totally starve new + * requests to cold extents. In that case, prepare one request + * in blocking mode. */ + list_for_each_entry_safe(req, tmp, &incoming, tl_requests) { + list_del_init(&req->tl_requests); + req->rq_state |= RQ_IN_ACT_LOG; + if (!drbd_al_begin_io_prepare(device, &req->i)) { + /* Corresponding extent was hot after all? */ + drbd_send_and_submit(device, req); + } else { + /* Found a request to a cold extent. + * Put on "pending" list, + * and try to cumulate with more. */ + list_add(&req->tl_requests, &pending); + goto skip_fast_path; + } + } } } @@ -1326,23 +1353,35 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct return limit; } -static struct drbd_request *find_oldest_request(struct drbd_connection *connection) +static void find_oldest_requests( + struct drbd_connection *connection, + struct drbd_device *device, + struct drbd_request **oldest_req_waiting_for_peer, + struct drbd_request **oldest_req_waiting_for_disk) { - /* Walk the transfer log, - * and find the oldest not yet completed request */ struct drbd_request *r; + *oldest_req_waiting_for_peer = NULL; + *oldest_req_waiting_for_disk = NULL; list_for_each_entry(r, &connection->transfer_log, tl_requests) { - if (atomic_read(&r->completion_ref)) - return r; + const unsigned s = r->rq_state; + if (!*oldest_req_waiting_for_peer + && ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) + *oldest_req_waiting_for_peer = r; + + if (!*oldest_req_waiting_for_disk + && (s & RQ_LOCAL_PENDING) && r->device == device) + *oldest_req_waiting_for_disk = r; + + if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk) + break; } - return NULL; } void request_timer_fn(unsigned long data) { struct drbd_device *device = (struct drbd_device *) data; struct drbd_connection *connection = first_peer_device(device)->connection; - struct drbd_request *req; /* oldest request */ + struct drbd_request *req_disk, *req_peer; /* oldest request */ struct net_conf *nc; unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ unsigned long now; @@ -1366,8 +1405,8 @@ void request_timer_fn(unsigned long data) now = jiffies; spin_lock_irq(&device->resource->req_lock); - req = find_oldest_request(connection); - if (!req) { + find_oldest_requests(connection, device, &req_peer, &req_disk); + if (req_peer == NULL && req_disk == NULL) { spin_unlock_irq(&device->resource->req_lock); mod_timer(&device->request_timer, now + et); return; @@ -1389,19 +1428,26 @@ void request_timer_fn(unsigned long data) * ~198 days with 250 HZ, we have a window where the timeout would need * to expire twice (worst case) to become effective. Good enough. */ - if (ent && req->rq_state & RQ_NET_PENDING && - time_after(now, req->start_time + ent) && + if (ent && req_peer && + time_after(now, req_peer->start_time + ent) && !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); } - if (dt && req->rq_state & RQ_LOCAL_PENDING && req->device == device && - time_after(now, req->start_time + dt) && + if (dt && req_disk && + time_after(now, req_disk->start_time + dt) && !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { drbd_warn(device, "Local backing device failed to meet the disk-timeout\n"); __drbd_chk_io_error(device, DRBD_FORCE_DETACH); } - nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; + + /* Reschedule timer for the nearest not already expired timeout. + * Fallback to now + min(effective network timeout, disk timeout). */ + ent = (ent && req_peer && time_before(now, req_peer->start_time + ent)) + ? req_peer->start_time + ent : now + et; + dt = (dt && req_disk && time_before(now, req_disk->start_time + dt)) + ? req_disk->start_time + dt : now + et; + nt = time_before(ent, dt) ? ent : dt; spin_unlock_irq(&connection->resource->req_lock); mod_timer(&device->request_timer, nt); } diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index c684c963538e..8566cd5866b4 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -30,7 +30,6 @@ #include <linux/slab.h> #include <linux/drbd.h> #include "drbd_int.h" -#include "drbd_wrappers.h" /* The request callbacks will be called in irq context by the IDE drivers, and in Softirqs/Tasklets/BH context by the SCSI drivers, @@ -111,11 +110,14 @@ enum drbd_req_event { BARRIER_ACKED, /* in protocol A and B */ DATA_RECEIVED, /* (remote read) */ + COMPLETED_OK, READ_COMPLETED_WITH_ERROR, READ_AHEAD_COMPLETED_WITH_ERROR, WRITE_COMPLETED_WITH_ERROR, + DISCARD_COMPLETED_NOTSUPP, + DISCARD_COMPLETED_WITH_ERROR, + ABORT_DISK_IO, - COMPLETED_OK, RESEND, FAIL_FROZEN_DISK_IO, RESTART_FROZEN_DISK_IO, diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 1a84345a3868..a5d8aae00e04 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -54,8 +54,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); -static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state ns, - enum sanitize_state_warnings *warn); +static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn); static inline bool is_susp(union drbd_state s) { @@ -287,7 +287,7 @@ _req_st_cond(struct drbd_device *device, union drbd_state mask, spin_lock_irqsave(&device->resource->req_lock, flags); os = drbd_read_state(device); - ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); rv = is_valid_transition(os, ns); if (rv >= SS_SUCCESS) rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ @@ -333,7 +333,7 @@ drbd_req_state(struct drbd_device *device, union drbd_state mask, spin_lock_irqsave(&device->resource->req_lock, flags); os = drbd_read_state(device); - ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); rv = is_valid_transition(os, ns); if (rv < SS_SUCCESS) { spin_unlock_irqrestore(&device->resource->req_lock, flags); @@ -740,8 +740,8 @@ static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_st * When we loose connection, we have to set the state of the peers disk (pdsk) * to D_UNKNOWN. This rule and many more along those lines are in this function. */ -static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state ns, - enum sanitize_state_warnings *warn) +static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn) { enum drbd_fencing_p fp; enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; @@ -882,11 +882,13 @@ static union drbd_state sanitize_state(struct drbd_device *device, union drbd_st } if (fp == FP_STONITH && - (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED)) + (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && + !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO && - (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && + !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { @@ -958,7 +960,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, os = drbd_read_state(device); - ns = sanitize_state(device, ns, &ssw); + ns = sanitize_state(device, os, ns, &ssw); if (ns.i == os.i) return SS_NOTHING_TO_DO; @@ -1656,7 +1658,7 @@ conn_is_valid_transition(struct drbd_connection *connection, union drbd_state ma idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; os = drbd_read_state(device); - ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) ns.disk = os.disk; @@ -1718,7 +1720,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union number_of_volumes++; os = drbd_read_state(device); ns = apply_mask_val(os, mask, val); - ns = sanitize_state(device, ns, NULL); + ns = sanitize_state(device, os, ns, NULL); if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) ns.disk = os.disk; @@ -1763,19 +1765,19 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union static enum drbd_state_rv _conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val) { - enum drbd_state_rv rv; + enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */; if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags)) - return SS_CW_SUCCESS; + rv = SS_CW_SUCCESS; if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags)) - return SS_CW_FAILED_BY_PEER; + rv = SS_CW_FAILED_BY_PEER; - rv = conn_is_valid_transition(connection, mask, val, 0); - if (rv == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS) - rv = SS_UNKNOWN_ERROR; /* continue waiting */ + err = conn_is_valid_transition(connection, mask, val, 0); + if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS) + return rv; - return rv; + return err; } enum drbd_state_rv diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 2c4ce42c3657..d8f57b6305cd 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -118,7 +118,7 @@ static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __rele /* writes on behalf of the partner, or resync writes, * "submitted" by the receiver, final stage. */ -static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) +void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) { unsigned long flags = 0; struct drbd_peer_device *peer_device = peer_req->peer_device; @@ -150,7 +150,9 @@ static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __rel do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee); - if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) + /* FIXME do we want to detach for failed REQ_DISCARD? + * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ + if (peer_req->flags & EE_WAS_ERROR) __drbd_chk_io_error(device, DRBD_WRITE_ERROR); spin_unlock_irqrestore(&device->resource->req_lock, flags); @@ -176,10 +178,12 @@ void drbd_peer_request_endio(struct bio *bio, int error) struct drbd_device *device = peer_req->peer_device->device; int uptodate = bio_flagged(bio, BIO_UPTODATE); int is_write = bio_data_dir(bio) == WRITE; + int is_discard = !!(bio->bi_rw & REQ_DISCARD); if (error && __ratelimit(&drbd_ratelimit_state)) drbd_warn(device, "%s: error=%d s=%llus\n", - is_write ? "write" : "read", error, + is_write ? (is_discard ? "discard" : "write") + : "read", error, (unsigned long long)peer_req->i.sector); if (!error && !uptodate) { if (__ratelimit(&drbd_ratelimit_state)) @@ -263,7 +267,12 @@ void drbd_request_endio(struct bio *bio, int error) /* to avoid recursion in __req_mod */ if (unlikely(error)) { - what = (bio_data_dir(bio) == WRITE) + if (bio->bi_rw & REQ_DISCARD) + what = (error == -EOPNOTSUPP) + ? DISCARD_COMPLETED_NOTSUPP + : DISCARD_COMPLETED_WITH_ERROR; + else + what = (bio_data_dir(bio) == WRITE) ? WRITE_COMPLETED_WITH_ERROR : (bio_rw(bio) == READ) ? READ_COMPLETED_WITH_ERROR @@ -395,7 +404,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. */ peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, - size, GFP_TRY); + size, true /* has real payload */, GFP_TRY); if (!peer_req) goto defer; @@ -492,10 +501,9 @@ struct fifo_buffer *fifo_alloc(int fifo_size) return fb; } -static int drbd_rs_controller(struct drbd_device *device) +static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in) { struct disk_conf *dc; - unsigned int sect_in; /* Number of sectors that came in since the last turn */ unsigned int want; /* The number of sectors we want in the proxy */ int req_sect; /* Number of sectors to request in this turn */ int correction; /* Number of sectors more we need in the proxy*/ @@ -505,9 +513,6 @@ static int drbd_rs_controller(struct drbd_device *device) int max_sect; struct fifo_buffer *plan; - sect_in = atomic_xchg(&device->rs_sect_in, 0); /* Number of sectors that came in */ - device->rs_in_flight -= sect_in; - dc = rcu_dereference(device->ldev->disk_conf); plan = rcu_dereference(device->rs_plan_s); @@ -550,11 +555,16 @@ static int drbd_rs_controller(struct drbd_device *device) static int drbd_rs_number_requests(struct drbd_device *device) { - int number; + unsigned int sect_in; /* Number of sectors that came in since the last turn */ + int number, mxb; + + sect_in = atomic_xchg(&device->rs_sect_in, 0); + device->rs_in_flight -= sect_in; rcu_read_lock(); + mxb = drbd_get_max_buffers(device) / 2; if (rcu_dereference(device->rs_plan_s)->size) { - number = drbd_rs_controller(device) >> (BM_BLOCK_SHIFT - 9); + number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9); device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; } else { device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate; @@ -562,8 +572,14 @@ static int drbd_rs_number_requests(struct drbd_device *device) } rcu_read_unlock(); - /* ignore the amount of pending requests, the resync controller should - * throttle down to incoming reply rate soon enough anyways. */ + /* Don't have more than "max-buffers"/2 in-flight. + * Otherwise we may cause the remote site to stall on drbd_alloc_pages(), + * potentially causing a distributed deadlock on congestion during + * online-verify or (checksum-based) resync, if max-buffers, + * socket buffer sizes and resync rate settings are mis-configured. */ + if (mxb - device->rs_in_flight < number) + number = mxb - device->rs_in_flight; + return number; } @@ -597,7 +613,7 @@ static int make_resync_request(struct drbd_device *device, int cancel) max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9; number = drbd_rs_number_requests(device); - if (number == 0) + if (number <= 0) goto requeue; for (i = 0; i < number; i++) { @@ -647,7 +663,7 @@ next_sector: */ align = 1; rollback_i = i; - for (;;) { + while (i < number) { if (size + BM_BLOCK_SIZE > max_bio_size) break; @@ -1670,11 +1686,15 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } clear_bit(B_RS_H_DONE, &device->flags); - write_lock_irq(&global_state_lock); + /* req_lock: serialize with drbd_send_and_submit() and others + * global_state_lock: for stable sync-after dependencies */ + spin_lock_irq(&device->resource->req_lock); + write_lock(&global_state_lock); /* Did some connection breakage or IO error race with us? */ if (device->state.conn < C_CONNECTED || !get_ldev_if_state(device, D_NEGOTIATING)) { - write_unlock_irq(&global_state_lock); + write_unlock(&global_state_lock); + spin_unlock_irq(&device->resource->req_lock); mutex_unlock(device->state_mutex); return; } @@ -1714,7 +1734,8 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } _drbd_pause_after(device); } - write_unlock_irq(&global_state_lock); + write_unlock(&global_state_lock); + spin_unlock_irq(&device->resource->req_lock); if (r == SS_SUCCESS) { /* reset rs_last_bcast when a resync or verify is started, @@ -1778,34 +1799,6 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) mutex_unlock(device->state_mutex); } -/* If the resource already closed the current epoch, but we did not - * (because we have not yet seen new requests), we should send the - * corresponding barrier now. Must be checked within the same spinlock - * that is used to check for new requests. */ -static bool need_to_send_barrier(struct drbd_connection *connection) -{ - if (!connection->send.seen_any_write_yet) - return false; - - /* Skip barriers that do not contain any writes. - * This may happen during AHEAD mode. */ - if (!connection->send.current_epoch_writes) - return false; - - /* ->req_lock is held when requests are queued on - * connection->sender_work, and put into ->transfer_log. - * It is also held when ->current_tle_nr is increased. - * So either there are already new requests queued, - * and corresponding barriers will be send there. - * Or nothing new is queued yet, so the difference will be 1. - */ - if (atomic_read(&connection->current_tle_nr) != - connection->send.current_epoch_nr + 1) - return false; - - return true; -} - static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) { spin_lock_irq(&queue->q_lock); @@ -1864,12 +1857,22 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * spin_unlock_irq(&connection->resource->req_lock); break; } - send_barrier = need_to_send_barrier(connection); + + /* We found nothing new to do, no to-be-communicated request, + * no other work item. We may still need to close the last + * epoch. Next incoming request epoch will be connection -> + * current transfer log epoch number. If that is different + * from the epoch of the last request we communicated, it is + * safe to send the epoch separating barrier now. + */ + send_barrier = + atomic_read(&connection->current_tle_nr) != + connection->send.current_epoch_nr; spin_unlock_irq(&connection->resource->req_lock); - if (send_barrier) { - drbd_send_barrier(connection); - connection->send.current_epoch_nr++; - } + + if (send_barrier) + maybe_send_barrier(connection, + connection->send.current_epoch_nr + 1); schedule(); /* may be woken up for other things but new work, too, * e.g. if the current epoch got closed. diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h deleted file mode 100644 index 3db9ebaf64f6..000000000000 --- a/drivers/block/drbd/drbd_wrappers.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef _DRBD_WRAPPERS_H -#define _DRBD_WRAPPERS_H - -#include <linux/ctype.h> -#include <linux/mm.h> -#include "drbd_int.h" - -/* see get_sb_bdev and bd_claim */ -extern char *drbd_sec_holder; - -/* sets the number of 512 byte sectors of our virtual device */ -static inline void drbd_set_my_capacity(struct drbd_device *device, - sector_t size) -{ - /* set_capacity(device->this_bdev->bd_disk, size); */ - set_capacity(device->vdisk, size); - device->this_bdev->bd_inode->i_size = (loff_t)size << 9; -} - -#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) - -/* bi_end_io handlers */ -extern void drbd_md_io_complete(struct bio *bio, int error); -extern void drbd_peer_request_endio(struct bio *bio, int error); -extern void drbd_request_endio(struct bio *bio, int error); - -/* - * used to submit our private bio - */ -static inline void drbd_generic_make_request(struct drbd_device *device, - int fault_type, struct bio *bio) -{ - __release(local); - if (!bio->bi_bdev) { - printk(KERN_ERR "drbd%d: drbd_generic_make_request: " - "bio->bi_bdev == NULL\n", - device_to_minor(device)); - dump_stack(); - bio_endio(bio, -ENODEV); - return; - } - - if (drbd_insert_fault(device, fault_type)) - bio_endio(bio, -EIO); - else - generic_make_request(bio); -} - -#ifndef __CHECKER__ -# undef __cond_lock -# define __cond_lock(x,c) (c) -#endif - -#endif |