diff options
Diffstat (limited to 'drivers/block/drbd')
-rw-r--r-- | drivers/block/drbd/drbd_bitmap.c | 21 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 151 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 158 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 52 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_proc.c | 19 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 666 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 40 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_strings.c | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 206 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_wrappers.h | 16 |
10 files changed, 903 insertions, 428 deletions
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 3390716898d5..e3f88d6e1412 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -84,6 +84,9 @@ struct drbd_bitmap { #define BM_MD_IO_ERROR 1 #define BM_P_VMALLOCED 2 +static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, + unsigned long e, int val, const enum km_type km); + static int bm_is_locked(struct drbd_bitmap *b) { return test_bit(BM_LOCKED, &b->bm_flags); @@ -441,7 +444,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) * In case this is actually a resize, we copy the old bitmap into the new one. * Otherwise, the bitmap is initialized to all bits set. */ -int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) +int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) { struct drbd_bitmap *b = mdev->bitmap; unsigned long bits, words, owords, obits, *p_addr, *bm; @@ -516,7 +519,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) obits = b->bm_bits; growing = bits > obits; - if (opages) + if (opages && growing && set_new_bits) bm_set_surplus(b); b->bm_pages = npages; @@ -526,8 +529,12 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) b->bm_dev_capacity = capacity; if (growing) { - bm_memset(b, owords, 0xff, words-owords); - b->bm_set += bits - obits; + if (set_new_bits) { + bm_memset(b, owords, 0xff, words-owords); + b->bm_set += bits - obits; + } else + bm_memset(b, owords, 0x00, words-owords); + } if (want < have) { @@ -773,7 +780,7 @@ static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int /* nothing to do, on disk == in memory */ # define bm_cpu_to_lel(x) ((void)0) # else -void bm_cpu_to_lel(struct drbd_bitmap *b) +static void bm_cpu_to_lel(struct drbd_bitmap *b) { /* need to cpu_to_lel all the pages ... * this may be optimized by using @@ -1015,7 +1022,7 @@ unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_f * wants bitnr, not sector. * expected to be called for only a few bits (e - s about BITS_PER_LONG). * Must hold bitmap lock already. */ -int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, +static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, unsigned long e, int val, const enum km_type km) { struct drbd_bitmap *b = mdev->bitmap; @@ -1053,7 +1060,7 @@ int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, * for val != 0, we change 0 -> 1, return code positive * for val == 0, we change 1 -> 0, return code negative * wants bitnr, not sector */ -int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, +static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, const unsigned long e, int val) { unsigned long flags; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e5e86a781820..e9654c8d5b62 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -132,6 +132,7 @@ enum { DRBD_FAULT_DT_RA = 6, /* data read ahead */ DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ DRBD_FAULT_AL_EE = 8, /* alloc ee */ + DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */ DRBD_FAULT_MAX, }; @@ -208,8 +209,11 @@ enum drbd_packets { P_RS_IS_IN_SYNC = 0x22, /* meta socket */ P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */ P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */ + /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */ + /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */ + P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ - P_MAX_CMD = 0x25, + P_MAX_CMD = 0x28, P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ P_MAX_OPT_CMD = 0x101, @@ -264,6 +268,7 @@ static inline const char *cmdname(enum drbd_packets cmd) [P_CSUM_RS_REQUEST] = "CsumRSRequest", [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", [P_COMPRESSED_BITMAP] = "CBitmap", + [P_DELAY_PROBE] = "DelayProbe", [P_MAX_CMD] = NULL, }; @@ -481,7 +486,8 @@ struct p_sizes { u64 u_size; /* user requested size */ u64 c_size; /* current exported size */ u32 max_segment_size; /* Maximal size of a BIO */ - u32 queue_order_type; + u16 queue_order_type; /* not yet implemented in DRBD*/ + u16 dds_flags; /* use enum dds_flags here. */ } __packed; struct p_state { @@ -538,6 +544,18 @@ struct p_compressed_bm { u8 code[0]; } __packed; +struct p_delay_probe { + struct p_header head; + u32 seq_num; /* sequence number to match the two probe packets */ + u32 offset; /* usecs the probe got sent after the reference time point */ +} __packed; + +struct delay_probe { + struct list_head list; + unsigned int seq_num; + struct timeval time; +}; + /* DCBP: Drbd Compressed Bitmap Packet ... */ static inline enum drbd_bitmap_code DCBP_get_code(struct p_compressed_bm *p) @@ -722,22 +740,6 @@ enum epoch_event { EV_CLEANUP = 32, /* used as flag */ }; -struct drbd_epoch_entry { - struct drbd_work w; - struct drbd_conf *mdev; - struct bio *private_bio; - struct hlist_node colision; - sector_t sector; - unsigned int size; - struct drbd_epoch *epoch; - - /* up to here, the struct layout is identical to drbd_request; - * we might be able to use that to our advantage... */ - - unsigned int flags; - u64 block_id; -}; - struct drbd_wq_barrier { struct drbd_work w; struct completion done; @@ -748,17 +750,49 @@ struct digest_info { void *digest; }; -/* ee flag bits */ +struct drbd_epoch_entry { + struct drbd_work w; + struct hlist_node colision; + struct drbd_epoch *epoch; + struct drbd_conf *mdev; + struct page *pages; + atomic_t pending_bios; + unsigned int size; + /* see comments on ee flag bits below */ + unsigned long flags; + sector_t sector; + u64 block_id; +}; + +/* ee flag bits. + * While corresponding bios are in flight, the only modification will be + * set_bit WAS_ERROR, which has to be atomic. + * If no bios are in flight yet, or all have been completed, + * non-atomic modification to ee->flags is ok. + */ enum { __EE_CALL_AL_COMPLETE_IO, - __EE_CONFLICT_PENDING, __EE_MAY_SET_IN_SYNC, + + /* This epoch entry closes an epoch using a barrier. + * On sucessful completion, the epoch is released, + * and the P_BARRIER_ACK send. */ __EE_IS_BARRIER, + + /* In case a barrier failed, + * we need to resubmit without the barrier flag. */ + __EE_RESUBMITTED, + + /* we may have several bios per epoch entry. + * if any of those fail, we set this flag atomically + * from the endio callback */ + __EE_WAS_ERROR, }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) -#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) #define EE_IS_BARRIER (1<<__EE_IS_BARRIER) +#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) +#define EE_WAS_ERROR (1<<__EE_WAS_ERROR) /* global flag bits */ enum { @@ -908,9 +942,12 @@ struct drbd_conf { unsigned int ko_count; struct drbd_work resync_work, unplug_work, - md_sync_work; + md_sync_work, + delay_probe_work, + uuid_work; struct timer_list resync_timer; struct timer_list md_sync_timer; + struct timer_list delay_probe_timer; /* Used after attach while negotiating new disk state. */ union drbd_state new_state_tmp; @@ -1026,6 +1063,13 @@ struct drbd_conf { u64 ed_uuid; /* UUID of the exposed data */ struct mutex state_mutex; char congestion_reason; /* Why we where congested... */ + struct list_head delay_probes; /* protected by peer_seq_lock */ + int data_delay; /* Delay of packets on the data-sock behind meta-sock */ + unsigned int delay_seq; /* To generate sequence numbers of delay probes */ + struct timeval dps_time; /* delay-probes-start-time */ + unsigned int dp_volume_last; /* send_cnt of last delay probe */ + int c_sync_rate; /* current resync rate after delay_probe magic */ + atomic_t new_c_uuid; }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) @@ -1081,6 +1125,11 @@ enum chg_state_flags { CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, }; +enum dds_flags { + DDSF_FORCED = 1, + DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ +}; + extern void drbd_init_set_defaults(struct drbd_conf *mdev); extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, union drbd_state mask, union drbd_state val); @@ -1113,7 +1162,7 @@ extern int drbd_send_protocol(struct drbd_conf *mdev); extern int drbd_send_uuids(struct drbd_conf *mdev); extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); -extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply); +extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); extern int _drbd_send_state(struct drbd_conf *mdev); extern int drbd_send_state(struct drbd_conf *mdev); extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, @@ -1311,7 +1360,7 @@ struct bm_extent { #define APP_R_HSIZE 15 extern int drbd_bm_init(struct drbd_conf *mdev); -extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors); +extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits); extern void drbd_bm_cleanup(struct drbd_conf *mdev); extern void drbd_bm_set_all(struct drbd_conf *mdev); extern void drbd_bm_clear_all(struct drbd_conf *mdev); @@ -1383,7 +1432,7 @@ extern void drbd_resume_io(struct drbd_conf *mdev); extern char *ppsize(char *buf, unsigned long long size); extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; -extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, int force) __must_hold(local); +extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); extern void resync_after_online_grow(struct drbd_conf *); extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, @@ -1414,7 +1463,8 @@ static inline void ov_oos_print(struct drbd_conf *mdev) } -extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); +extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); +extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *); /* worker callbacks */ extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); @@ -1438,6 +1488,8 @@ extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); extern void resync_timer_fn(unsigned long data); /* drbd_receiver.c */ +extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, + const unsigned rw, const int fault_type); extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, u64 id, @@ -1593,6 +1645,41 @@ void drbd_bcast_ee(struct drbd_conf *mdev, * inline helper functions *************************/ +/* see also page_chain_add and friends in drbd_receiver.c */ +static inline struct page *page_chain_next(struct page *page) +{ + return (struct page *)page_private(page); +} +#define page_chain_for_each(page) \ + for (; page && ({ prefetch(page_chain_next(page)); 1; }); \ + page = page_chain_next(page)) +#define page_chain_for_each_safe(page, n) \ + for (; page && ({ n = page_chain_next(page); 1; }); page = n) + +static inline int drbd_bio_has_active_page(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + __bio_for_each_segment(bvec, bio, i, 0) { + if (page_count(bvec->bv_page) > 1) + return 1; + } + + return 0; +} + +static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) +{ + struct page *page = e->pages; + page_chain_for_each(page) { + if (page_count(page) > 1) + return 1; + } + return 0; +} + + static inline void drbd_state_lock(struct drbd_conf *mdev) { wait_event(mdev->misc_wait, @@ -2132,13 +2219,15 @@ static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) return 0; if (test_bit(BITMAP_IO, &mdev->flags)) return 0; + if (atomic_read(&mdev->new_c_uuid)) + return 0; return 1; } /* I'd like to use wait_event_lock_irq, * but I'm not sure when it got introduced, * and not sure when it has 3 or 4 arguments */ -static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) +static inline void inc_ap_bio(struct drbd_conf *mdev, int count) { /* compare with after_state_ch, * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ @@ -2152,6 +2241,9 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) * to avoid races with the reconnect code, * we need to atomic_inc within the spinlock. */ + if (atomic_read(&mdev->new_c_uuid) && atomic_add_unless(&mdev->new_c_uuid, -1, 1)) + drbd_queue_work_front(&mdev->data.work, &mdev->uuid_work); + spin_lock_irq(&mdev->req_lock); while (!__inc_ap_bio_cond(mdev)) { prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); @@ -2160,7 +2252,7 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) finish_wait(&mdev->misc_wait, &wait); spin_lock_irq(&mdev->req_lock); } - atomic_add(one_or_two, &mdev->ap_bio_cnt); + atomic_add(count, &mdev->ap_bio_cnt); spin_unlock_irq(&mdev->req_lock); } @@ -2251,7 +2343,8 @@ static inline void drbd_md_flush(struct drbd_conf *mdev) if (test_bit(MD_NO_BARRIER, &mdev->flags)) return; - r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); + r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); if (r) { set_bit(MD_NO_BARRIER, &mdev->flags); dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 93d1f9b469d4..be2d2da9cdba 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -684,6 +684,9 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns) else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) rv = SS_NO_REMOTE_DISK; + else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + else if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S || ns.conn == C_SYNC_SOURCE || @@ -840,7 +843,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state break; case C_WF_BITMAP_S: case C_PAUSED_SYNC_S: - ns.pdsk = D_OUTDATED; + /* remap any consistent state to D_OUTDATED, + * but disallow "upgrade" of not even consistent states. + */ + ns.pdsk = + (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED) + ? os.pdsk : D_OUTDATED; break; case C_SYNC_SOURCE: ns.pdsk = D_INCONSISTENT; @@ -1205,21 +1213,20 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, && (ns.pdsk < D_INCONSISTENT || ns.pdsk == D_UNKNOWN || ns.pdsk == D_OUTDATED)) { - kfree(mdev->p_uuid); - mdev->p_uuid = NULL; if (get_ldev(mdev)) { if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && - mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { - drbd_uuid_new_current(mdev); - drbd_send_uuids(mdev); - } + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE && + !atomic_read(&mdev->new_c_uuid)) + atomic_set(&mdev->new_c_uuid, 2); put_ldev(mdev); } } if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { - if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) - drbd_uuid_new_current(mdev); + /* Diskless peer becomes primary or got connected do diskless, primary peer. */ + if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0 && + !atomic_read(&mdev->new_c_uuid)) + atomic_set(&mdev->new_c_uuid, 2); /* D_DISKLESS Peer becomes secondary */ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) @@ -1232,7 +1239,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */ mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */ - drbd_send_sizes(mdev, 0); /* to start sync... */ + drbd_send_sizes(mdev, 0, 0); /* to start sync... */ drbd_send_uuids(mdev); drbd_send_state(mdev); } @@ -1343,6 +1350,24 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, drbd_md_sync(mdev); } +static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + if (get_ldev(mdev)) { + if (mdev->ldev->md.uuid[UI_BITMAP] == 0) { + drbd_uuid_new_current(mdev); + if (get_net_conf(mdev)) { + drbd_send_uuids(mdev); + put_net_conf(mdev); + } + drbd_md_sync(mdev); + } + put_ldev(mdev); + } + atomic_dec(&mdev->new_c_uuid); + wake_up(&mdev->misc_wait); + + return 1; +} static int drbd_thread_setup(void *arg) { @@ -1755,7 +1780,7 @@ int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val) (struct p_header *)&p, sizeof(p)); } -int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) +int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) { struct p_sizes p; sector_t d_size, u_size; @@ -1767,7 +1792,6 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) d_size = drbd_get_max_capacity(mdev->ldev); u_size = mdev->ldev->dc.disk_size; q_order_type = drbd_queue_order_type(mdev); - p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev)); put_ldev(mdev); } else { d_size = 0; @@ -1779,7 +1803,8 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) p.u_size = cpu_to_be64(u_size); p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); - p.queue_order_type = cpu_to_be32(q_order_type); + p.queue_order_type = cpu_to_be16(q_order_type); + p.dds_flags = cpu_to_be16(flags); ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, (struct p_header *)&p, sizeof(p)); @@ -2180,6 +2205,43 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) return ok; } +static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds) +{ + struct p_delay_probe dp; + int offset, ok = 0; + struct timeval now; + + mutex_lock(&ds->mutex); + if (likely(ds->socket)) { + do_gettimeofday(&now); + offset = now.tv_usec - mdev->dps_time.tv_usec + + (now.tv_sec - mdev->dps_time.tv_sec) * 1000000; + dp.seq_num = cpu_to_be32(mdev->delay_seq); + dp.offset = cpu_to_be32(offset); + + ok = _drbd_send_cmd(mdev, ds->socket, P_DELAY_PROBE, + (struct p_header *)&dp, sizeof(dp), 0); + } + mutex_unlock(&ds->mutex); + + return ok; +} + +static int drbd_send_delay_probes(struct drbd_conf *mdev) +{ + int ok; + + mdev->delay_seq++; + do_gettimeofday(&mdev->dps_time); + ok = drbd_send_delay_probe(mdev, &mdev->meta); + ok = ok && drbd_send_delay_probe(mdev, &mdev->data); + + mdev->dp_volume_last = mdev->send_cnt; + mod_timer(&mdev->delay_probe_timer, jiffies + mdev->sync_conf.dp_interval * HZ / 10); + + return ok; +} + /* called on sndtimeo * returns FALSE if we should retry, * TRUE if we think connection is dead @@ -2309,6 +2371,44 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) return 1; } +static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) +{ + struct page *page = e->pages; + unsigned len = e->size; + page_chain_for_each(page) { + unsigned l = min_t(unsigned, len, PAGE_SIZE); + if (!_drbd_send_page(mdev, page, 0, l)) + return 0; + len -= l; + } + return 1; +} + +static void consider_delay_probes(struct drbd_conf *mdev) +{ + if (mdev->state.conn != C_SYNC_SOURCE || mdev->agreed_pro_version < 93) + return; + + if (mdev->dp_volume_last + mdev->sync_conf.dp_volume * 2 < mdev->send_cnt) + drbd_send_delay_probes(mdev); +} + +static int w_delay_probes(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + if (!cancel && mdev->state.conn == C_SYNC_SOURCE) + drbd_send_delay_probes(mdev); + + return 1; +} + +static void delay_probe_timer_fn(unsigned long data) +{ + struct drbd_conf *mdev = (struct drbd_conf *) data; + + if (list_empty(&mdev->delay_probe_work.list)) + drbd_queue_work(&mdev->data.work, &mdev->delay_probe_work); +} + /* Used to send write requests * R_PRIMARY -> Peer (P_DATA) */ @@ -2360,7 +2460,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); if (ok && dgs) { dgb = mdev->int_dig_out; - drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); + drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); } if (ok) { @@ -2371,6 +2471,10 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) } drbd_put_data_sock(mdev); + + if (ok) + consider_delay_probes(mdev); + return ok; } @@ -2409,13 +2513,17 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, sizeof(p), MSG_MORE); if (ok && dgs) { dgb = mdev->int_dig_out; - drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb); + drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb); ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); } if (ok) - ok = _drbd_send_zc_bio(mdev, e->private_bio); + ok = _drbd_send_zc_ee(mdev, e); drbd_put_data_sock(mdev); + + if (ok) + consider_delay_probes(mdev); + return ok; } @@ -2600,6 +2708,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->net_cnt, 0); atomic_set(&mdev->packet_seq, 0); atomic_set(&mdev->pp_in_use, 0); + atomic_set(&mdev->new_c_uuid, 0); mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); @@ -2628,16 +2737,26 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) INIT_LIST_HEAD(&mdev->unplug_work.list); INIT_LIST_HEAD(&mdev->md_sync_work.list); INIT_LIST_HEAD(&mdev->bm_io_work.w.list); + INIT_LIST_HEAD(&mdev->delay_probes); + INIT_LIST_HEAD(&mdev->delay_probe_work.list); + INIT_LIST_HEAD(&mdev->uuid_work.list); + mdev->resync_work.cb = w_resync_inactive; mdev->unplug_work.cb = w_send_write_hint; mdev->md_sync_work.cb = w_md_sync; mdev->bm_io_work.w.cb = w_bitmap_io; + mdev->delay_probe_work.cb = w_delay_probes; + mdev->uuid_work.cb = w_new_current_uuid; init_timer(&mdev->resync_timer); init_timer(&mdev->md_sync_timer); + init_timer(&mdev->delay_probe_timer); mdev->resync_timer.function = resync_timer_fn; mdev->resync_timer.data = (unsigned long) mdev; mdev->md_sync_timer.function = md_sync_timer_fn; mdev->md_sync_timer.data = (unsigned long) mdev; + mdev->delay_probe_timer.function = delay_probe_timer_fn; + mdev->delay_probe_timer.data = (unsigned long) mdev; + init_waitqueue_head(&mdev->misc_wait); init_waitqueue_head(&mdev->state_wait); @@ -2680,7 +2799,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) drbd_set_my_capacity(mdev, 0); if (mdev->bitmap) { /* maybe never allocated. */ - drbd_bm_resize(mdev, 0); + drbd_bm_resize(mdev, 0, 1); drbd_bm_cleanup(mdev); } @@ -3129,7 +3248,7 @@ int __init drbd_init(void) if (err) goto Enomem; - drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops); + drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); if (!drbd_proc) { printk(KERN_ERR "drbd: unable to register proc file\n"); goto Enomem; @@ -3660,7 +3779,8 @@ _drbd_fault_str(unsigned int type) { [DRBD_FAULT_DT_RD] = "Data read", [DRBD_FAULT_DT_RA] = "Data read ahead", [DRBD_FAULT_BM_ALLOC] = "BM allocation", - [DRBD_FAULT_AL_EE] = "EE allocation" + [DRBD_FAULT_AL_EE] = "EE allocation", + [DRBD_FAULT_RECEIVE] = "receive data corruption", }; return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**"; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6429d2b19e06..632e3245d1bb 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -510,7 +510,7 @@ void drbd_resume_io(struct drbd_conf *mdev) * Returns 0 on success, negative return values indicate errors. * You should call drbd_md_sync() after calling this function. */ -enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force) __must_hold(local) +enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) { sector_t prev_first_sect, prev_size; /* previous meta location */ sector_t la_size; @@ -541,12 +541,12 @@ enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force /* TODO: should only be some assert here, not (re)init... */ drbd_md_set_sector_offsets(mdev, mdev->ldev); - size = drbd_new_dev_size(mdev, mdev->ldev, force); + size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED); if (drbd_get_capacity(mdev->this_bdev) != size || drbd_bm_capacity(mdev) != size) { int err; - err = drbd_bm_resize(mdev, size); + err = drbd_bm_resize(mdev, size, !(flags & DDSF_NO_RESYNC)); if (unlikely(err)) { /* currently there is only one error: ENOMEM! */ size = drbd_bm_capacity(mdev)>>1; @@ -704,9 +704,6 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; int max_segments = mdev->ldev->dc.max_bio_bvecs; - if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv) - max_seg_s = PAGE_SIZE; - max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); blk_queue_max_hw_sectors(q, max_seg_s >> 9); @@ -1199,13 +1196,12 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, } /* allocation not in the IO path, cqueue thread context */ - new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); + new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); if (!new_conf) { retcode = ERR_NOMEM; goto fail; } - memset(new_conf, 0, sizeof(struct net_conf)); new_conf->timeout = DRBD_TIMEOUT_DEF; new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; new_conf->ping_int = DRBD_PING_INT_DEF; @@ -1477,8 +1473,8 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, { struct resize rs; int retcode = NO_ERROR; - int ldsc = 0; /* local disk size changed */ enum determine_dev_size dd; + enum dds_flags ddsf; memset(&rs, 0, sizeof(struct resize)); if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { @@ -1502,13 +1498,17 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, goto fail; } - if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { - mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); - ldsc = 1; + if (rs.no_resync && mdev->agreed_pro_version < 93) { + retcode = ERR_NEED_APV_93; + goto fail; } + if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) + mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); + mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; - dd = drbd_determin_dev_size(mdev, rs.resize_force); + ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); + dd = drbd_determin_dev_size(mdev, ddsf); drbd_md_sync(mdev); put_ldev(mdev); if (dd == dev_size_error) { @@ -1516,12 +1516,12 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, goto fail; } - if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) { + if (mdev->state.conn == C_CONNECTED) { if (dd == grew) set_bit(RESIZE_PENDING, &mdev->flags); drbd_send_uuids(mdev); - drbd_send_sizes(mdev, 1); + drbd_send_sizes(mdev, 1, ddsf); } fail: @@ -1551,6 +1551,10 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n sc.rate = DRBD_RATE_DEF; sc.after = DRBD_AFTER_DEF; sc.al_extents = DRBD_AL_EXTENTS_DEF; + sc.dp_volume = DRBD_DP_VOLUME_DEF; + sc.dp_interval = DRBD_DP_INTERVAL_DEF; + sc.throttle_th = DRBD_RS_THROTTLE_TH_DEF; + sc.hold_off_th = DRBD_RS_HOLD_OFF_TH_DEF; } else memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); @@ -2207,9 +2211,9 @@ void drbd_bcast_ee(struct drbd_conf *mdev, { struct cn_msg *cn_reply; struct drbd_nl_cfg_reply *reply; - struct bio_vec *bvec; unsigned short *tl; - int i; + struct page *page; + unsigned len; if (!e) return; @@ -2247,11 +2251,15 @@ void drbd_bcast_ee(struct drbd_conf *mdev, put_unaligned(T_ee_data, tl++); put_unaligned(e->size, tl++); - __bio_for_each_segment(bvec, e->private_bio, i, 0) { - void *d = kmap(bvec->bv_page); - memcpy(tl, d + bvec->bv_offset, bvec->bv_len); - kunmap(bvec->bv_page); - tl=(unsigned short*)((char*)tl + bvec->bv_len); + len = e->size; + page = e->pages; + page_chain_for_each(page) { + void *d = kmap_atomic(page, KM_USER0); + unsigned l = min_t(unsigned, len, PAGE_SIZE); + memcpy(tl, d, l); + kunmap_atomic(d, KM_USER0); + tl = (unsigned short*)((char*)tl + l); + len -= l; } put_unaligned(TT_END, tl++); /* Close the tag list */ diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index be3374b68460..d0f1767ea4c3 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -73,14 +73,21 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); /* if more than 1 GB display in MB */ if (mdev->rs_total > 0x100000L) - seq_printf(seq, "(%lu/%lu)M\n\t", + seq_printf(seq, "(%lu/%lu)M", (unsigned long) Bit2KB(rs_left >> 10), (unsigned long) Bit2KB(mdev->rs_total >> 10)); else - seq_printf(seq, "(%lu/%lu)K\n\t", + seq_printf(seq, "(%lu/%lu)K", (unsigned long) Bit2KB(rs_left), (unsigned long) Bit2KB(mdev->rs_total)); + if (mdev->state.conn == C_SYNC_TARGET) + seq_printf(seq, " queue_delay: %d.%d ms\n\t", + mdev->data_delay / 1000, + (mdev->data_delay % 1000) / 100); + else if (mdev->state.conn == C_SYNC_SOURCE) + seq_printf(seq, " delay_probe: %u\n\t", mdev->delay_seq); + /* see drivers/md/md.c * We do not want to overflow, so the order of operands and * the * 100 / 100 trick are important. We do a +1 to be @@ -128,6 +135,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) else seq_printf(seq, " (%ld)", dbdt); + if (mdev->state.conn == C_SYNC_TARGET) { + if (mdev->c_sync_rate > 1000) + seq_printf(seq, " want: %d,%03d", + mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000); + else + seq_printf(seq, " want: %d", mdev->c_sync_rate); + } + seq_printf(seq, " K/sec\n"); } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 3f096e7959b4..bc9ab7fb2cc7 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -80,30 +80,128 @@ static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epo #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) -static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev) +/* + * some helper functions to deal with single linked page lists, + * page->private being our "next" pointer. + */ + +/* If at least n pages are linked at head, get n pages off. + * Otherwise, don't modify head, and return NULL. + * Locking is the responsibility of the caller. + */ +static struct page *page_chain_del(struct page **head, int n) +{ + struct page *page; + struct page *tmp; + + BUG_ON(!n); + BUG_ON(!head); + + page = *head; + + if (!page) + return NULL; + + while (page) { + tmp = page_chain_next(page); + if (--n == 0) + break; /* found sufficient pages */ + if (tmp == NULL) + /* insufficient pages, don't use any of them. */ + return NULL; + page = tmp; + } + + /* add end of list marker for the returned list */ + set_page_private(page, 0); + /* actual return value, and adjustment of head */ + page = *head; + *head = tmp; + return page; +} + +/* may be used outside of locks to find the tail of a (usually short) + * "private" page chain, before adding it back to a global chain head + * with page_chain_add() under a spinlock. */ +static struct page *page_chain_tail(struct page *page, int *len) +{ + struct page *tmp; + int i = 1; + while ((tmp = page_chain_next(page))) + ++i, page = tmp; + if (len) + *len = i; + return page; +} + +static int page_chain_free(struct page *page) +{ + struct page *tmp; + int i = 0; + page_chain_for_each_safe(page, tmp) { + put_page(page); + ++i; + } + return i; +} + +static void page_chain_add(struct page **head, + struct page *chain_first, struct page *chain_last) +{ +#if 1 + struct page *tmp; + tmp = page_chain_tail(chain_first, NULL); + BUG_ON(tmp != chain_last); +#endif + + /* add chain to head */ + set_page_private(chain_last, (unsigned long)*head); + *head = chain_first; +} + +static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number) { struct page *page = NULL; + struct page *tmp = NULL; + int i = 0; /* Yes, testing drbd_pp_vacant outside the lock is racy. * So what. It saves a spin_lock. */ - if (drbd_pp_vacant > 0) { + if (drbd_pp_vacant >= number) { spin_lock(&drbd_pp_lock); - page = drbd_pp_pool; - if (page) { - drbd_pp_pool = (struct page *)page_private(page); - set_page_private(page, 0); /* just to be polite */ - drbd_pp_vacant--; - } + page = page_chain_del(&drbd_pp_pool, number); + if (page) + drbd_pp_vacant -= number; spin_unlock(&drbd_pp_lock); + if (page) + return page; } + /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ - if (!page) - page = alloc_page(GFP_TRY); - if (page) - atomic_inc(&mdev->pp_in_use); - return page; + for (i = 0; i < number; i++) { + tmp = alloc_page(GFP_TRY); + if (!tmp) + break; + set_page_private(tmp, (unsigned long)page); + page = tmp; + } + + if (i == number) + return page; + + /* Not enough pages immediately available this time. + * No need to jump around here, drbd_pp_alloc will retry this + * function "soon". */ + if (page) { + tmp = page_chain_tail(page, NULL); + spin_lock(&drbd_pp_lock); + page_chain_add(&drbd_pp_pool, page, tmp); + drbd_pp_vacant += i; + spin_unlock(&drbd_pp_lock); + } + return NULL; } /* kick lower level device, if we have more than (arbitrary number) @@ -127,7 +225,7 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed list_for_each_safe(le, tle, &mdev->net_ee) { e = list_entry(le, struct drbd_epoch_entry, w.list); - if (drbd_bio_has_active_page(e->private_bio)) + if (drbd_ee_has_active_page(e)) break; list_move(le, to_be_freed); } @@ -148,32 +246,34 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) } /** - * drbd_pp_alloc() - Returns a page, fails only if a signal comes in + * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled) * @mdev: DRBD device. - * @retry: whether or not to retry allocation forever (or until signalled) + * @number: number of pages requested + * @retry: whether to retry, if not enough pages are available right now + * + * Tries to allocate number pages, first from our own page pool, then from + * the kernel, unless this allocation would exceed the max_buffers setting. + * Possibly retry until DRBD frees sufficient pages somewhere else. * - * Tries to allocate a page, first from our own page pool, then from the - * kernel, unless this allocation would exceed the max_buffers setting. - * If @retry is non-zero, retry until DRBD frees a page somewhere else. + * Returns a page chain linked via page->private. */ -static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) +static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry) { struct page *page = NULL; DEFINE_WAIT(wait); - if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { - page = drbd_pp_first_page_or_try_alloc(mdev); - if (page) - return page; - } + /* Yes, we may run up to @number over max_buffers. If we + * follow it strictly, the admin will get it wrong anyways. */ + if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) + page = drbd_pp_first_pages_or_try_alloc(mdev, number); - for (;;) { + while (page == NULL) { prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); drbd_kick_lo_and_reclaim_net(mdev); if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { - page = drbd_pp_first_page_or_try_alloc(mdev); + page = drbd_pp_first_pages_or_try_alloc(mdev, number); if (page) break; } @@ -190,62 +290,32 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) } finish_wait(&drbd_pp_wait, &wait); + if (page) + atomic_add(number, &mdev->pp_in_use); return page; } /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. - * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */ + * Is also used from inside an other spin_lock_irq(&mdev->req_lock); + * Either links the page chain back to the global pool, + * or returns all pages to the system. */ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) { - int free_it; - - spin_lock(&drbd_pp_lock); - if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { - free_it = 1; - } else { - set_page_private(page, (unsigned long)drbd_pp_pool); - drbd_pp_pool = page; - drbd_pp_vacant++; - free_it = 0; - } - spin_unlock(&drbd_pp_lock); - - atomic_dec(&mdev->pp_in_use); - - if (free_it) - __free_page(page); - - wake_up(&drbd_pp_wait); -} - -static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio) -{ - struct page *p_to_be_freed = NULL; - struct page *page; - struct bio_vec *bvec; int i; - - spin_lock(&drbd_pp_lock); - __bio_for_each_segment(bvec, bio, i, 0) { - if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { - set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed); - p_to_be_freed = bvec->bv_page; - } else { - set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool); - drbd_pp_pool = bvec->bv_page; - drbd_pp_vacant++; - } - } - spin_unlock(&drbd_pp_lock); - atomic_sub(bio->bi_vcnt, &mdev->pp_in_use); - - while (p_to_be_freed) { - page = p_to_be_freed; - p_to_be_freed = (struct page *)page_private(page); - set_page_private(page, 0); /* just to be polite */ - put_page(page); + if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) + i = page_chain_free(page); + else { + struct page *tmp; + tmp = page_chain_tail(page, &i); + spin_lock(&drbd_pp_lock); + page_chain_add(&drbd_pp_pool, page, tmp); + drbd_pp_vacant += i; + spin_unlock(&drbd_pp_lock); } - + atomic_sub(i, &mdev->pp_in_use); + i = atomic_read(&mdev->pp_in_use); + if (i < 0) + dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i); wake_up(&drbd_pp_wait); } @@ -270,11 +340,9 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, unsigned int data_size, gfp_t gfp_mask) __must_hold(local) { - struct request_queue *q; struct drbd_epoch_entry *e; struct page *page; - struct bio *bio; - unsigned int ds; + unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) return NULL; @@ -286,84 +354,32 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, return NULL; } - bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE)); - if (!bio) { - if (!(gfp_mask & __GFP_NOWARN)) - dev_err(DEV, "alloc_ee: Allocation of a bio failed\n"); - goto fail1; - } - - bio->bi_bdev = mdev->ldev->backing_bdev; - bio->bi_sector = sector; - - ds = data_size; - while (ds) { - page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT)); - if (!page) { - if (!(gfp_mask & __GFP_NOWARN)) - dev_err(DEV, "alloc_ee: Allocation of a page failed\n"); - goto fail2; - } - if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) { - drbd_pp_free(mdev, page); - dev_err(DEV, "alloc_ee: bio_add_page(s=%llu," - "data_size=%u,ds=%u) failed\n", - (unsigned long long)sector, data_size, ds); - - q = bdev_get_queue(bio->bi_bdev); - if (q->merge_bvec_fn) { - struct bvec_merge_data bvm = { - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size, - .bi_rw = bio->bi_rw, - }; - int l = q->merge_bvec_fn(q, &bvm, - &bio->bi_io_vec[bio->bi_vcnt]); - dev_err(DEV, "merge_bvec_fn() = %d\n", l); - } - - /* dump more of the bio. */ - dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs); - dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt); - dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size); - dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments); - - goto fail2; - break; - } - ds -= min_t(int, ds, PAGE_SIZE); - } - - D_ASSERT(data_size == bio->bi_size); - - bio->bi_private = e; - e->mdev = mdev; - e->sector = sector; - e->size = bio->bi_size; + page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); + if (!page) + goto fail; - e->private_bio = bio; - e->block_id = id; INIT_HLIST_NODE(&e->colision); e->epoch = NULL; + e->mdev = mdev; + e->pages = page; + atomic_set(&e->pending_bios, 0); + e->size = data_size; e->flags = 0; + e->sector = sector; + e->sector = sector; + e->block_id = id; return e; - fail2: - drbd_pp_free_bio_pages(mdev, bio); - bio_put(bio); - fail1: + fail: mempool_free(e, drbd_ee_mempool); - return NULL; } void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) { - struct bio *bio = e->private_bio; - drbd_pp_free_bio_pages(mdev, bio); - bio_put(bio); + drbd_pp_free(mdev, e->pages); + D_ASSERT(atomic_read(&e->pending_bios) == 0); D_ASSERT(hlist_unhashed(&e->colision)); mempool_free(e, drbd_ee_mempool); } @@ -902,7 +918,7 @@ retry: if (!drbd_send_protocol(mdev)) return -1; drbd_send_sync_param(mdev, &mdev->sync_conf); - drbd_send_sizes(mdev, 0); + drbd_send_sizes(mdev, 0, 0); drbd_send_uuids(mdev); drbd_send_state(mdev); clear_bit(USE_DEGR_WFC_T, &mdev->flags); @@ -946,7 +962,8 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d int rv; if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { - rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL); + rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, + NULL, BLKDEV_IFL_WAIT); if (rv) { dev_err(DEV, "local disk flush failed with status %d\n", rv); /* would rather check on EOPNOTSUPP, but that is not reliable. @@ -1120,6 +1137,101 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) } /** + * drbd_submit_ee() + * @mdev: DRBD device. + * @e: epoch entry + * @rw: flag field, see bio->bi_rw + */ +/* TODO allocate from our own bio_set. */ +int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, + const unsigned rw, const int fault_type) +{ + struct bio *bios = NULL; + struct bio *bio; + struct page *page = e->pages; + sector_t sector = e->sector; + unsigned ds = e->size; + unsigned n_bios = 0; + unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; + + if (atomic_read(&mdev->new_c_uuid)) { + if (atomic_add_unless(&mdev->new_c_uuid, -1, 1)) { + drbd_uuid_new_current(mdev); + drbd_md_sync(mdev); + + atomic_dec(&mdev->new_c_uuid); + wake_up(&mdev->misc_wait); + } + wait_event(mdev->misc_wait, !atomic_read(&mdev->new_c_uuid)); + } + + /* In most cases, we will only need one bio. But in case the lower + * level restrictions happen to be different at this offset on this + * side than those of the sending peer, we may need to submit the + * request in more than one bio. */ +next_bio: + bio = bio_alloc(GFP_NOIO, nr_pages); + if (!bio) { + dev_err(DEV, "submit_ee: Allocation of a bio failed\n"); + goto fail; + } + /* > e->sector, unless this is the first bio */ + bio->bi_sector = sector; + bio->bi_bdev = mdev->ldev->backing_bdev; + /* we special case some flags in the multi-bio case, see below + * (BIO_RW_UNPLUG, BIO_RW_BARRIER) */ + bio->bi_rw = rw; + bio->bi_private = e; + bio->bi_end_io = drbd_endio_sec; + + bio->bi_next = bios; + bios = bio; + ++n_bios; + + page_chain_for_each(page) { + unsigned len = min_t(unsigned, ds, PAGE_SIZE); + if (!bio_add_page(bio, page, len, 0)) { + /* a single page must always be possible! */ + BUG_ON(bio->bi_vcnt == 0); + goto next_bio; + } + ds -= len; + sector += len >> 9; + --nr_pages; + } + D_ASSERT(page == NULL); + D_ASSERT(ds == 0); + + atomic_set(&e->pending_bios, n_bios); + do { + bio = bios; + bios = bios->bi_next; + bio->bi_next = NULL; + + /* strip off BIO_RW_UNPLUG unless it is the last bio */ + if (bios) + bio->bi_rw &= ~(1<<BIO_RW_UNPLUG); + + drbd_generic_make_request(mdev, fault_type, bio); + + /* strip off BIO_RW_BARRIER, + * unless it is the first or last bio */ + if (bios && bios->bi_next) + bios->bi_rw &= ~(1<<BIO_RW_BARRIER); + } while (bios); + maybe_kick_lo(mdev); + return 0; + +fail: + while (bios) { + bio = bios; + bios = bios->bi_next; + bio_put(bio); + } + return -ENOMEM; +} + +/** * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set * @mdev: DRBD device. * @w: work object. @@ -1128,8 +1240,6 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local) { struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; - struct bio *bio = e->private_bio; - /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place, (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) so that we can finish that epoch in drbd_may_finish_epoch(). @@ -1143,33 +1253,17 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea if (previous_epoch(mdev, e->epoch)) dev_warn(DEV, "Write ordering was not enforced (one time event)\n"); - /* prepare bio for re-submit, - * re-init volatile members */ /* we still have a local reference, * get_ldev was done in receive_Data. */ - bio->bi_bdev = mdev->ldev->backing_bdev; - bio->bi_sector = e->sector; - bio->bi_size = e->size; - bio->bi_idx = 0; - - bio->bi_flags &= ~(BIO_POOL_MASK - 1); - bio->bi_flags |= 1 << BIO_UPTODATE; - - /* don't know whether this is necessary: */ - bio->bi_phys_segments = 0; - bio->bi_next = NULL; - - /* these should be unchanged: */ - /* bio->bi_end_io = drbd_endio_write_sec; */ - /* bio->bi_vcnt = whatever; */ e->w.cb = e_end_block; - - /* This is no longer a barrier request. */ - bio->bi_rw &= ~(1UL << BIO_RW_BARRIER); - - drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio); - + if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) { + /* drbd_submit_ee fails for one reason only: + * if was not able to allocate sufficient bios. + * requeue, try again later. */ + e->w.cb = w_e_reissue; + drbd_queue_work(&mdev->data.work, &e->w); + } return 1; } @@ -1261,13 +1355,13 @@ static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) static struct drbd_epoch_entry * read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) { + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); struct drbd_epoch_entry *e; - struct bio_vec *bvec; struct page *page; - struct bio *bio; - int dgs, ds, i, rr; + int dgs, ds, rr; void *dig_in = mdev->int_dig_in; void *dig_vv = mdev->int_dig_vv; + unsigned long *data; dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; @@ -1286,29 +1380,44 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ ERR_IF(data_size & 0x1ff) return NULL; ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL; + /* even though we trust out peer, + * we sometimes have to double check. */ + if (sector + (data_size>>9) > capacity) { + dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n", + (unsigned long long)capacity, + (unsigned long long)sector, data_size); + return NULL; + } + /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); if (!e) return NULL; - bio = e->private_bio; + ds = data_size; - bio_for_each_segment(bvec, bio, i) { - page = bvec->bv_page; - rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE)); + page = e->pages; + page_chain_for_each(page) { + unsigned len = min_t(int, ds, PAGE_SIZE); + data = kmap(page); + rr = drbd_recv(mdev, data, len); + if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) { + dev_err(DEV, "Fault injection: Corrupting data on receive\n"); + data[0] = data[0] ^ (unsigned long)-1; + } kunmap(page); - if (rr != min_t(int, ds, PAGE_SIZE)) { + if (rr != len) { drbd_free_ee(mdev, e); dev_warn(DEV, "short read receiving data: read %d expected %d\n", - rr, min_t(int, ds, PAGE_SIZE)); + rr, len); return NULL; } ds -= rr; } if (dgs) { - drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); + drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv); if (memcmp(dig_in, dig_vv, dgs)) { dev_err(DEV, "Digest integrity check FAILED.\n"); drbd_bcast_ee(mdev, "digest failed", @@ -1330,7 +1439,10 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size) int rr, rv = 1; void *data; - page = drbd_pp_alloc(mdev, 1); + if (!data_size) + return TRUE; + + page = drbd_pp_alloc(mdev, 1, 1); data = kmap(page); while (data_size) { @@ -1394,7 +1506,7 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, } if (dgs) { - drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); + drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv); if (memcmp(dig_in, dig_vv, dgs)) { dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); return 0; @@ -1415,7 +1527,7 @@ static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int u D_ASSERT(hlist_unhashed(&e->colision)); - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { drbd_set_in_sync(mdev, sector, e->size); ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); } else { @@ -1434,30 +1546,28 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si struct drbd_epoch_entry *e; e = read_in_block(mdev, ID_SYNCER, sector, data_size); - if (!e) { - put_ldev(mdev); - return FALSE; - } + if (!e) + goto fail; dec_rs_pending(mdev); - e->private_bio->bi_end_io = drbd_endio_write_sec; - e->private_bio->bi_rw = WRITE; - e->w.cb = e_end_resync_block; - inc_unacked(mdev); /* corresponding dec_unacked() in e_end_resync_block() * respective _drbd_clear_done_ee */ + e->w.cb = e_end_resync_block; + spin_lock_irq(&mdev->req_lock); list_add(&e->w.list, &mdev->sync_ee); spin_unlock_irq(&mdev->req_lock); - drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio); - /* accounting done in endio */ + if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) + return TRUE; - maybe_kick_lo(mdev); - return TRUE; + drbd_free_ee(mdev, e); +fail: + put_ldev(mdev); + return FALSE; } static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) @@ -1552,7 +1662,7 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) } if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { pcmd = (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn <= C_PAUSED_SYNC_T && e->flags & EE_MAY_SET_IN_SYNC) ? @@ -1698,7 +1808,6 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h) return FALSE; } - e->private_bio->bi_end_io = drbd_endio_write_sec; e->w.cb = e_end_block; spin_lock(&mdev->epoch_lock); @@ -1894,12 +2003,8 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h) drbd_al_begin_io(mdev, e->sector); } - e->private_bio->bi_rw = rw; - drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio); - /* accounting done in endio */ - - maybe_kick_lo(mdev); - return TRUE; + if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) + return TRUE; out_interrupted: /* yes, the epoch_size now is imbalanced. @@ -1945,7 +2050,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) "no local data.\n"); drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : P_NEG_RS_DREPLY , p); - return TRUE; + return drbd_drain_block(mdev, h->length - brps); } /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD @@ -1957,9 +2062,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) return FALSE; } - e->private_bio->bi_rw = READ; - e->private_bio->bi_end_io = drbd_endio_read_sec; - switch (h->command) { case P_DATA_REQUEST: e->w.cb = w_e_end_data_req; @@ -2053,10 +2155,8 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) inc_unacked(mdev); - drbd_generic_make_request(mdev, fault_type, e->private_bio); - maybe_kick_lo(mdev); - - return TRUE; + if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) + return TRUE; out_free_e: kfree(di); @@ -2473,6 +2573,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol hg > 0 ? "source" : "target"); } + if (abs(hg) == 100) + drbd_khelper(mdev, "initial-split-brain"); + if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { int pcount = (mdev->state.role == R_PRIMARY) + (peer_role == R_PRIMARY); @@ -2518,7 +2621,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol * after an attempted attach on a diskless node. * We just refuse to attach -- well, we drop the "connection" * to that disk, in a way... */ - dev_alert(DEV, "Split-Brain detected, dropping connection!\n"); + dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n"); drbd_khelper(mdev, "split-brain"); return C_MASK; } @@ -2849,7 +2952,7 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) unsigned int max_seg_s; sector_t p_size, p_usize, my_usize; int ldsc = 0; /* local disk size changed */ - enum drbd_conns nconn; + enum dds_flags ddsf; ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; if (drbd_recv(mdev, h->payload, h->length) != h->length) @@ -2905,8 +3008,9 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) } #undef min_not_zero + ddsf = be16_to_cpu(p->dds_flags); if (get_ldev(mdev)) { - dd = drbd_determin_dev_size(mdev, 0); + dd = drbd_determin_dev_size(mdev, ddsf); put_ldev(mdev); if (dd == dev_size_error) return FALSE; @@ -2916,33 +3020,21 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) drbd_set_my_capacity(mdev, p_size); } - if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { - nconn = drbd_sync_handshake(mdev, - mdev->state.peer, mdev->state.pdsk); - put_ldev(mdev); - - if (nconn == C_MASK) { - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return FALSE; - } - - if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) { - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return FALSE; - } - } - if (get_ldev(mdev)) { if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); ldsc = 1; } - max_seg_s = be32_to_cpu(p->max_segment_size); + if (mdev->agreed_pro_version < 94) + max_seg_s = be32_to_cpu(p->max_segment_size); + else /* drbd 8.3.8 onwards */ + max_seg_s = DRBD_MAX_SEGMENT_SIZE; + if (max_seg_s != queue_max_segment_size(mdev->rq_queue)) drbd_setup_queue_param(mdev, max_seg_s); - drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type)); + drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type)); put_ldev(mdev); } @@ -2951,14 +3043,17 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) drbd_get_capacity(mdev->this_bdev) || ldsc) { /* we have different sizes, probably peer * needs to know my new size... */ - drbd_send_sizes(mdev, 0); + drbd_send_sizes(mdev, 0, ddsf); } if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || (dd == grew && mdev->state.conn == C_CONNECTED)) { if (mdev->state.pdsk >= D_INCONSISTENT && - mdev->state.disk >= D_INCONSISTENT) - resync_after_online_grow(mdev); - else + mdev->state.disk >= D_INCONSISTENT) { + if (ddsf & DDSF_NO_RESYNC) + dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n"); + else + resync_after_online_grow(mdev); + } else set_bit(RESYNC_AFTER_NEG, &mdev->flags); } } @@ -3490,6 +3585,92 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h) return TRUE; } +static void timeval_sub_us(struct timeval* tv, unsigned int us) +{ + tv->tv_sec -= us / 1000000; + us = us % 1000000; + if (tv->tv_usec > us) { + tv->tv_usec += 1000000; + tv->tv_sec--; + } + tv->tv_usec -= us; +} + +static void got_delay_probe(struct drbd_conf *mdev, int from, struct p_delay_probe *p) +{ + struct delay_probe *dp; + struct list_head *le; + struct timeval now; + int seq_num; + int offset; + int data_delay; + + seq_num = be32_to_cpu(p->seq_num); + offset = be32_to_cpu(p->offset); + + spin_lock(&mdev->peer_seq_lock); + if (!list_empty(&mdev->delay_probes)) { + if (from == USE_DATA_SOCKET) + le = mdev->delay_probes.next; + else + le = mdev->delay_probes.prev; + + dp = list_entry(le, struct delay_probe, list); + + if (dp->seq_num == seq_num) { + list_del(le); + spin_unlock(&mdev->peer_seq_lock); + do_gettimeofday(&now); + timeval_sub_us(&now, offset); + data_delay = + now.tv_usec - dp->time.tv_usec + + (now.tv_sec - dp->time.tv_sec) * 1000000; + + if (data_delay > 0) + mdev->data_delay = data_delay; + + kfree(dp); + return; + } + + if (dp->seq_num > seq_num) { + spin_unlock(&mdev->peer_seq_lock); + dev_warn(DEV, "Previous allocation failure of struct delay_probe?\n"); + return; /* Do not alloca a struct delay_probe.... */ + } + } + spin_unlock(&mdev->peer_seq_lock); + + dp = kmalloc(sizeof(struct delay_probe), GFP_NOIO); + if (!dp) { + dev_warn(DEV, "Failed to allocate a struct delay_probe, do not worry.\n"); + return; + } + + dp->seq_num = seq_num; + do_gettimeofday(&dp->time); + timeval_sub_us(&dp->time, offset); + + spin_lock(&mdev->peer_seq_lock); + if (from == USE_DATA_SOCKET) + list_add(&dp->list, &mdev->delay_probes); + else + list_add_tail(&dp->list, &mdev->delay_probes); + spin_unlock(&mdev->peer_seq_lock); +} + +static int receive_delay_probe(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_delay_probe *p = (struct p_delay_probe *)h; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + got_delay_probe(mdev, USE_DATA_SOCKET, p); + return TRUE; +} + typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); static drbd_cmd_handler_f drbd_default_handler[] = { @@ -3513,6 +3694,7 @@ static drbd_cmd_handler_f drbd_default_handler[] = { [P_OV_REQUEST] = receive_DataRequest, [P_OV_REPLY] = receive_DataRequest, [P_CSUM_RS_REQUEST] = receive_DataRequest, + [P_DELAY_PROBE] = receive_delay_probe, /* anything missing from this table is in * the asender_tbl, see get_asender_cmd */ [P_MAX_CMD] = NULL, @@ -3739,7 +3921,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) dev_info(DEV, "net_ee not empty, killed %u entries\n", i); i = atomic_read(&mdev->pp_in_use); if (i) - dev_info(DEV, "pp_in_use = %u, expected 0\n", i); + dev_info(DEV, "pp_in_use = %d, expected 0\n", i); D_ASSERT(list_empty(&mdev->read_ee)); D_ASSERT(list_empty(&mdev->active_ee)); @@ -4232,7 +4414,6 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) sector = be64_to_cpu(p->sector); size = be32_to_cpu(p->blksize); - D_ASSERT(p->block_id == ID_SYNCER); update_peer_seq(mdev, be32_to_cpu(p->seq_num)); @@ -4290,6 +4471,14 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) return TRUE; } +static int got_delay_probe_m(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_delay_probe *p = (struct p_delay_probe *)h; + + got_delay_probe(mdev, USE_META_SOCKET, p); + return TRUE; +} + struct asender_cmd { size_t pkt_size; int (*process)(struct drbd_conf *mdev, struct p_header *h); @@ -4314,6 +4503,7 @@ static struct asender_cmd *get_asender_cmd(int cmd) [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, + [P_DELAY_PROBE] = { sizeof(struct p_delay_probe), got_delay_probe_m }, [P_MAX_CMD] = { 0, NULL }, }; if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index de81ab7b4627..3397f11d0ba9 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -722,6 +722,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) struct drbd_request *req; int local, remote; int err = -EIO; + int ret = 0; /* allocate outside of all locks; */ req = drbd_req_new(mdev, bio); @@ -784,7 +785,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) (mdev->state.pdsk == D_INCONSISTENT && mdev->state.conn >= C_CONNECTED)); - if (!(local || remote)) { + if (!(local || remote) && !mdev->state.susp) { dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); goto fail_free_complete; } @@ -810,6 +811,16 @@ allocate_barrier: /* GOOD, everything prepared, grab the spin_lock */ spin_lock_irq(&mdev->req_lock); + if (mdev->state.susp) { + /* If we got suspended, use the retry mechanism of + generic_make_request() to restart processing of this + bio. In the next call to drbd_make_request_26 + we sleep in inc_ap_bio() */ + ret = 1; + spin_unlock_irq(&mdev->req_lock); + goto fail_free_complete; + } + if (remote) { remote = (mdev->state.pdsk == D_UP_TO_DATE || (mdev->state.pdsk == D_INCONSISTENT && @@ -947,12 +958,14 @@ fail_and_free_req: req->private_bio = NULL; put_ldev(mdev); } - bio_endio(bio, err); + if (!ret) + bio_endio(bio, err); + drbd_req_free(req); dec_ap_bio(mdev); kfree(b); - return 0; + return ret; } /* helper function for drbd_make_request @@ -962,11 +975,6 @@ fail_and_free_req: */ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) { - /* Unconfigured */ - if (mdev->state.conn == C_DISCONNECTING && - mdev->state.disk == D_DISKLESS) - return 1; - if (mdev->state.role != R_PRIMARY && (!allow_oos || is_write)) { if (__ratelimit(&drbd_ratelimit_state)) { @@ -1070,15 +1078,21 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio) /* we need to get a "reference count" (ap_bio_cnt) * to avoid races with the disconnect/reconnect/suspend code. - * In case we need to split the bio here, we need to get two references + * In case we need to split the bio here, we need to get three references * atomically, otherwise we might deadlock when trying to submit the * second one! */ - inc_ap_bio(mdev, 2); + inc_ap_bio(mdev, 3); D_ASSERT(e_enr == s_enr + 1); - drbd_make_request_common(mdev, &bp->bio1); - drbd_make_request_common(mdev, &bp->bio2); + while (drbd_make_request_common(mdev, &bp->bio1)) + inc_ap_bio(mdev, 1); + + while (drbd_make_request_common(mdev, &bp->bio2)) + inc_ap_bio(mdev, 1); + + dec_ap_bio(mdev); + bio_pair_release(bp); } return 0; @@ -1115,7 +1129,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct } else if (limit && get_ldev(mdev)) { struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; - if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) { + if (b->merge_bvec_fn) { backing_limit = b->merge_bvec_fn(b, bvm, bvec); limit = min(limit, backing_limit); } diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index 76863e3f05be..85179e1fb50a 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c @@ -70,7 +70,7 @@ static const char *drbd_disk_s_names[] = { static const char *drbd_state_sw_errors[] = { [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", - [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk", + [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data", [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index d48a1dfd7b24..727ff6339754 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -47,8 +47,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca /* defined here: drbd_md_io_complete - drbd_endio_write_sec - drbd_endio_read_sec + drbd_endio_sec drbd_endio_pri * more endio handlers: @@ -85,27 +84,10 @@ void drbd_md_io_complete(struct bio *bio, int error) /* reads on behalf of the partner, * "submitted" by the receiver */ -void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) +void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local) { unsigned long flags = 0; - struct drbd_epoch_entry *e = NULL; - struct drbd_conf *mdev; - int uptodate = bio_flagged(bio, BIO_UPTODATE); - - e = bio->bi_private; - mdev = e->mdev; - - if (error) - dev_warn(DEV, "read: error=%d s=%llus\n", error, - (unsigned long long)e->sector); - if (!error && !uptodate) { - dev_warn(DEV, "read: setting error to -EIO s=%llus\n", - (unsigned long long)e->sector); - /* strange behavior of some lower level drivers... - * fail the request by clearing the uptodate flag, - * but do not return any error?! */ - error = -EIO; - } + struct drbd_conf *mdev = e->mdev; D_ASSERT(e->block_id != ID_VACANT); @@ -114,49 +96,38 @@ void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) list_del(&e->w.list); if (list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait); + if (test_bit(__EE_WAS_ERROR, &e->flags)) + __drbd_chk_io_error(mdev, FALSE); spin_unlock_irqrestore(&mdev->req_lock, flags); - drbd_chk_io_error(mdev, error, FALSE); drbd_queue_work(&mdev->data.work, &e->w); put_ldev(mdev); } +static int is_failed_barrier(int ee_flags) +{ + return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED)) + == (EE_IS_BARRIER|EE_WAS_ERROR); +} + /* writes on behalf of the partner, or resync writes, - * "submitted" by the receiver. - */ -void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) + * "submitted" by the receiver, final stage. */ +static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local) { unsigned long flags = 0; - struct drbd_epoch_entry *e = NULL; - struct drbd_conf *mdev; + struct drbd_conf *mdev = e->mdev; sector_t e_sector; int do_wake; int is_syncer_req; int do_al_complete_io; - int uptodate = bio_flagged(bio, BIO_UPTODATE); - int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER); - - e = bio->bi_private; - mdev = e->mdev; - if (error) - dev_warn(DEV, "write: error=%d s=%llus\n", error, - (unsigned long long)e->sector); - if (!error && !uptodate) { - dev_warn(DEV, "write: setting error to -EIO s=%llus\n", - (unsigned long long)e->sector); - /* strange behavior of some lower level drivers... - * fail the request by clearing the uptodate flag, - * but do not return any error?! */ - error = -EIO; - } - - /* error == -ENOTSUPP would be a better test, - * alas it is not reliable */ - if (error && is_barrier && e->flags & EE_IS_BARRIER) { + /* if this is a failed barrier request, disable use of barriers, + * and schedule for resubmission */ + if (is_failed_barrier(e->flags)) { drbd_bump_write_ordering(mdev, WO_bdev_flush); spin_lock_irqsave(&mdev->req_lock, flags); list_del(&e->w.list); + e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED; e->w.cb = w_e_reissue; /* put_ldev actually happens below, once we come here again. */ __release(local); @@ -167,17 +138,16 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) D_ASSERT(e->block_id != ID_VACANT); - spin_lock_irqsave(&mdev->req_lock, flags); - mdev->writ_cnt += e->size >> 9; - is_syncer_req = is_syncer_block_id(e->block_id); - /* after we moved e to done_ee, * we may no longer access it, * it may be freed/reused already! * (as soon as we release the req_lock) */ e_sector = e->sector; do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; + is_syncer_req = is_syncer_block_id(e->block_id); + spin_lock_irqsave(&mdev->req_lock, flags); + mdev->writ_cnt += e->size >> 9; list_del(&e->w.list); /* has been on active_ee or sync_ee */ list_add_tail(&e->w.list, &mdev->done_ee); @@ -190,7 +160,7 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) ? list_empty(&mdev->sync_ee) : list_empty(&mdev->active_ee); - if (error) + if (test_bit(__EE_WAS_ERROR, &e->flags)) __drbd_chk_io_error(mdev, FALSE); spin_unlock_irqrestore(&mdev->req_lock, flags); @@ -205,7 +175,42 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) wake_asender(mdev); put_ldev(mdev); +} + +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver. + */ +void drbd_endio_sec(struct bio *bio, int error) +{ + struct drbd_epoch_entry *e = bio->bi_private; + struct drbd_conf *mdev = e->mdev; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + int is_write = bio_data_dir(bio) == WRITE; + + if (error) + dev_warn(DEV, "%s: error=%d s=%llus\n", + is_write ? "write" : "read", error, + (unsigned long long)e->sector); + if (!error && !uptodate) { + dev_warn(DEV, "%s: setting error to -EIO s=%llus\n", + is_write ? "write" : "read", + (unsigned long long)e->sector); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + error = -EIO; + } + + if (error) + set_bit(__EE_WAS_ERROR, &e->flags); + bio_put(bio); /* no need for the bio anymore */ + if (atomic_dec_and_test(&e->pending_bios)) { + if (is_write) + drbd_endio_write_sec_final(e); + else + drbd_endio_read_sec_final(e); + } } /* read, readA or write requests on R_PRIMARY coming from drbd_make_request @@ -295,7 +300,34 @@ int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel) return 1; /* Simply ignore this! */ } -void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) +void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest) +{ + struct hash_desc desc; + struct scatterlist sg; + struct page *page = e->pages; + struct page *tmp; + unsigned len; + + desc.tfm = tfm; + desc.flags = 0; + + sg_init_table(&sg, 1); + crypto_hash_init(&desc); + + while ((tmp = page_chain_next(page))) { + /* all but the last page will be fully used */ + sg_set_page(&sg, page, PAGE_SIZE, 0); + crypto_hash_update(&desc, &sg, sg.length); + page = tmp; + } + /* and now the last, possibly only partially used page */ + len = e->size & (PAGE_SIZE - 1); + sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); + crypto_hash_update(&desc, &sg, sg.length); + crypto_hash_final(&desc, digest); +} + +void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) { struct hash_desc desc; struct scatterlist sg; @@ -329,11 +361,11 @@ static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel return 1; } - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { digest_size = crypto_hash_digestsize(mdev->csums_tfm); digest = kmalloc(digest_size, GFP_NOIO); if (digest) { - drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); + drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); inc_rs_pending(mdev); ok = drbd_send_drequest_csum(mdev, @@ -369,23 +401,21 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. */ e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); - if (!e) { - put_ldev(mdev); - return 2; - } + if (!e) + goto fail; spin_lock_irq(&mdev->req_lock); list_add(&e->w.list, &mdev->read_ee); spin_unlock_irq(&mdev->req_lock); - e->private_bio->bi_end_io = drbd_endio_read_sec; - e->private_bio->bi_rw = READ; e->w.cb = w_e_send_csum; + if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) + return 1; - mdev->read_cnt += size >> 9; - drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio); - - return 1; + drbd_free_ee(mdev, e); +fail: + put_ldev(mdev); + return 2; } void resync_timer_fn(unsigned long data) @@ -414,13 +444,25 @@ void resync_timer_fn(unsigned long data) drbd_queue_work(&mdev->data.work, &mdev->resync_work); } +static int calc_resync_rate(struct drbd_conf *mdev) +{ + int d = mdev->data_delay / 1000; /* us -> ms */ + int td = mdev->sync_conf.throttle_th * 100; /* 0.1s -> ms */ + int hd = mdev->sync_conf.hold_off_th * 100; /* 0.1s -> ms */ + int cr = mdev->sync_conf.rate; + + return d <= td ? cr : + d >= hd ? 0 : + cr + (cr * (td - d) / (hd - td)); +} + int w_make_resync_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { unsigned long bit; sector_t sector; const sector_t capacity = drbd_get_capacity(mdev->this_bdev); - int max_segment_size = queue_max_segment_size(mdev->rq_queue); + int max_segment_size; int number, i, size, pe, mx; int align, queued, sndbuf; @@ -446,7 +488,13 @@ int w_make_resync_request(struct drbd_conf *mdev, return 1; } - number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); + /* starting with drbd 8.3.8, we can handle multi-bio EEs, + * if it should be necessary */ + max_segment_size = mdev->agreed_pro_version < 94 ? + queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE; + + mdev->c_sync_rate = calc_resync_rate(mdev); + number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); pe = atomic_read(&mdev->rs_pending_cnt); mutex_lock(&mdev->data.mutex); @@ -509,12 +557,6 @@ next_sector: * * Additionally always align bigger requests, in order to * be prepared for all stripe sizes of software RAIDs. - * - * we _do_ care about the agreed-upon q->max_segment_size - * here, as splitting up the requests on the other side is more - * difficult. the consequence is, that on lvm and md and other - * "indirect" devices, this is dead code, since - * q->max_segment_size will be PAGE_SIZE. */ align = 1; for (;;) { @@ -806,7 +848,7 @@ out: /* helper */ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) { - if (drbd_bio_has_active_page(e->private_bio)) { + if (drbd_ee_has_active_page(e)) { /* This might happen if sendpage() has not finished */ spin_lock_irq(&mdev->req_lock); list_add_tail(&e->w.list, &mdev->net_ee); @@ -832,7 +874,7 @@ int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) return 1; } - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { ok = drbd_send_block(mdev, P_DATA_REPLY, e); } else { if (__ratelimit(&drbd_ratelimit_state)) @@ -873,7 +915,7 @@ int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) put_ldev(mdev); } - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { inc_rs_pending(mdev); ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); @@ -921,7 +963,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) di = (struct digest_info *)(unsigned long)e->block_id; - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { /* quick hack to try to avoid a race against reconfiguration. * a real fix would be much more involved, * introducing more locking mechanisms */ @@ -931,7 +973,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) digest = kmalloc(digest_size, GFP_NOIO); } if (digest) { - drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); + drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); eq = !memcmp(digest, di->digest, digest_size); kfree(digest); } @@ -973,14 +1015,14 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) if (unlikely(cancel)) goto out; - if (unlikely(!drbd_bio_uptodate(e->private_bio))) + if (unlikely((e->flags & EE_WAS_ERROR) != 0)) goto out; digest_size = crypto_hash_digestsize(mdev->verify_tfm); /* FIXME if this allocation fails, online verify will not terminate! */ digest = kmalloc(digest_size, GFP_NOIO); if (digest) { - drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); + drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); inc_rs_pending(mdev); ok = drbd_send_drequest_csum(mdev, e->sector, e->size, digest, digest_size, P_OV_REPLY); @@ -1029,11 +1071,11 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) di = (struct digest_info *)(unsigned long)e->block_id; - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { digest_size = crypto_hash_digestsize(mdev->verify_tfm); digest = kmalloc(digest_size, GFP_NOIO); if (digest) { - drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); + drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); D_ASSERT(digest_size == di->digest_size); eq = !memcmp(digest, di->digest, digest_size); diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h index f93fa111ce50..defdb5013ea3 100644 --- a/drivers/block/drbd/drbd_wrappers.h +++ b/drivers/block/drbd/drbd_wrappers.h @@ -18,23 +18,9 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev, #define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) -static inline int drbd_bio_has_active_page(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - __bio_for_each_segment(bvec, bio, i, 0) { - if (page_count(bvec->bv_page) > 1) - return 1; - } - - return 0; -} - /* bi_end_io handlers */ extern void drbd_md_io_complete(struct bio *bio, int error); -extern void drbd_endio_read_sec(struct bio *bio, int error); -extern void drbd_endio_write_sec(struct bio *bio, int error); +extern void drbd_endio_sec(struct bio *bio, int error); extern void drbd_endio_pri(struct bio *bio, int error); /* |