diff options
Diffstat (limited to 'drivers')
35 files changed, 1218 insertions, 909 deletions
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 94dc0a235919..2a05d955e30b 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -297,10 +297,6 @@ struct drbd_epoch { unsigned long flags; }; -/* Prototype declaration of function defined in drbd_receiver.c */ -int drbdd_init(struct drbd_thread *); -int drbd_asender(struct drbd_thread *); - /* drbd_epoch flag bits */ enum { DE_HAVE_BARRIER_NUMBER, @@ -864,7 +860,6 @@ struct drbd_device { struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */ struct list_head net_ee; /* zero-copy network send in progress */ - int next_barrier_nr; struct list_head resync_reads; atomic_t pp_in_use; /* allocated from page pool */ atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ @@ -1390,9 +1385,6 @@ extern void conn_free_crypto(struct drbd_connection *connection); extern void do_submit(struct work_struct *ws); extern void __drbd_make_request(struct drbd_device *, struct bio *); void drbd_submit_bio(struct bio *bio); -extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req); -extern int is_valid_ar_handle(struct drbd_request *, sector_t); - /* drbd_nl.c */ @@ -1474,7 +1466,6 @@ extern int w_resync_timer(struct drbd_work *, int); extern int w_send_write_hint(struct drbd_work *, int); extern int w_send_dblock(struct drbd_work *, int); extern int w_send_read_req(struct drbd_work *, int); -extern int w_e_reissue(struct drbd_work *, int); extern int w_restart_disk_io(struct drbd_work *, int); extern int w_send_out_of_sync(struct drbd_work *, int); @@ -1488,7 +1479,6 @@ extern int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags); extern int drbd_receiver(struct drbd_thread *thi); extern int drbd_ack_receiver(struct drbd_thread *thi); -extern void drbd_send_ping_wf(struct work_struct *ws); extern void drbd_send_acks_wf(struct work_struct *ws); extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); extern bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sector, @@ -1504,7 +1494,6 @@ extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request #define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0) #define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1) extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool); -extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a9e49b212341..449123eb54bf 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1550,7 +1550,7 @@ static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *pa * put_page(); and would cause either a VM_BUG directly, or * __page_cache_release a page that would actually still be referenced * by someone, leading to some obscure delayed Oops somewhere else. */ - if (!drbd_disable_sendpage && sendpage_ok(page)) + if (!drbd_disable_sendpage && sendpages_ok(page, len, offset)) msg.msg_flags |= MSG_NOSIGNAL | MSG_SPLICE_PAGES; drbd_update_congested(peer_device->connection); diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index e858e7e0383f..c2b6c4d9729d 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -876,7 +876,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns) ns.disk == D_OUTDATED) rv = SS_CONNECTED_OUTDATES; - else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + else if (nc && (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && (nc->verify_alg[0] == 0)) rv = SS_NO_VERIFY_ALG; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index c6ef0546ffc9..11901f2812ad 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2269,25 +2269,12 @@ static const struct file_operations mtip_flags_fops = { .llseek = no_llseek, }; -static int mtip_hw_debugfs_init(struct driver_data *dd) +static void mtip_hw_debugfs_init(struct driver_data *dd) { - if (!dfs_parent) - return -1; - dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent); - if (IS_ERR_OR_NULL(dd->dfs_node)) { - dev_warn(&dd->pdev->dev, - "Error creating node %s under debugfs\n", - dd->disk->disk_name); - dd->dfs_node = NULL; - return -1; - } - debugfs_create_file("flags", 0444, dd->dfs_node, dd, &mtip_flags_fops); debugfs_create_file("registers", 0444, dd->dfs_node, dd, &mtip_regs_fops); - - return 0; } static void mtip_hw_debugfs_exit(struct driver_data *dd) @@ -4043,10 +4030,6 @@ static int __init mtip_init(void) mtip_major = error; dfs_parent = debugfs_create_dir("rssd", NULL); - if (IS_ERR_OR_NULL(dfs_parent)) { - pr_warn("Error creating debugfs parent\n"); - dfs_parent = NULL; - } /* Register our PCI operations. */ error = pci_register_driver(&mtip_pci_driver); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 41a90150b501..b852050d8a96 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -181,6 +181,17 @@ static void nbd_requeue_cmd(struct nbd_cmd *cmd) { struct request *req = blk_mq_rq_from_pdu(cmd); + lockdep_assert_held(&cmd->lock); + + /* + * Clear INFLIGHT flag so that this cmd won't be completed in + * normal completion path + * + * INFLIGHT flag will be set when the cmd is queued to nbd next + * time. + */ + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); + if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags)) blk_mq_requeue_request(req, true); } @@ -339,7 +350,7 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, lim = queue_limits_start_update(nbd->disk->queue); if (nbd->config->flags & NBD_FLAG_SEND_TRIM) - lim.max_hw_discard_sectors = UINT_MAX; + lim.max_hw_discard_sectors = UINT_MAX >> SECTOR_SHIFT; else lim.max_hw_discard_sectors = 0; if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) { @@ -350,6 +361,11 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, lim.features |= BLK_FEAT_WRITE_CACHE; lim.features &= ~BLK_FEAT_FUA; } + if (nbd->config->flags & NBD_FLAG_ROTATIONAL) + lim.features |= BLK_FEAT_ROTATIONAL; + if (nbd->config->flags & NBD_FLAG_SEND_WRITE_ZEROES) + lim.max_write_zeroes_sectors = UINT_MAX >> SECTOR_SHIFT; + lim.logical_block_size = blksize; lim.physical_block_size = blksize; error = queue_limits_commit_update(nbd->disk->queue, &lim); @@ -418,6 +434,8 @@ static u32 req_to_nbd_cmd_type(struct request *req) return NBD_CMD_WRITE; case REQ_OP_READ: return NBD_CMD_READ; + case REQ_OP_WRITE_ZEROES: + return NBD_CMD_WRITE_ZEROES; default: return U32_MAX; } @@ -488,8 +506,8 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req) nbd_mark_nsock_dead(nbd, nsock, 1); mutex_unlock(&nsock->tx_lock); } - mutex_unlock(&cmd->lock); nbd_requeue_cmd(cmd); + mutex_unlock(&cmd->lock); nbd_config_put(nbd); return BLK_EH_DONE; } @@ -634,6 +652,8 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, if (req->cmd_flags & REQ_FUA) nbd_cmd_flags |= NBD_CMD_FLAG_FUA; + if ((req->cmd_flags & REQ_NOUNMAP) && (type == NBD_CMD_WRITE_ZEROES)) + nbd_cmd_flags |= NBD_CMD_FLAG_NO_HOLE; /* We did a partial send previously, and we at least sent the whole * request struct, so just go and send the rest of the pages in the @@ -1703,6 +1723,10 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused) seq_puts(s, "NBD_FLAG_SEND_FUA\n"); if (flags & NBD_FLAG_SEND_TRIM) seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); + if (flags & NBD_FLAG_SEND_WRITE_ZEROES) + seq_puts(s, "NBD_FLAG_SEND_WRITE_ZEROES\n"); + if (flags & NBD_FLAG_ROTATIONAL) + seq_puts(s, "NBD_FLAG_ROTATIONAL\n"); return 0; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 7cece5884b9c..3edb37a41312 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -498,8 +498,6 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) if (!pkt_debugfs_root) return; pd->dfs_d_root = debugfs_create_dir(pd->disk->disk_name, pkt_debugfs_root); - if (!pd->dfs_d_root) - return; pd->dfs_f_info = debugfs_create_file("info", 0444, pd->dfs_d_root, pd, &pkt_seq_fops); diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index f6e3a3c4b76c..08ce6d96d04c 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -149,15 +149,22 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); if (bio_add_page(bio, virt_to_page(data), datalen, offset_in_page(data)) != datalen) { - rnbd_srv_err(sess_dev, "Failed to map data to bio\n"); + rnbd_srv_err_rl(sess_dev, "Failed to map data to bio\n"); err = -EINVAL; goto bio_put; } + bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); + if (bio_has_data(bio) && + bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) { + rnbd_srv_err_rl(sess_dev, "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n", + bio->bi_iter.bi_size, msg->bi_size); + err = -EINVAL; + goto bio_put; + } bio->bi_end_io = rnbd_dev_bi_end_io; bio->bi_private = priv; bio->bi_iter.bi_sector = le64_to_cpu(msg->sector); - bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size); prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR || usrlen < sizeof(*msg) ? 0 : le16_to_cpu(msg->prio); bio_set_prio(bio, prio); diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1d53a3f48a0e..bca06bfb4bc3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -71,9 +71,6 @@ struct ublk_rq_data { struct llist_node node; struct kref ref; - __u64 sector; - __u32 operation; - __u32 nr_zones; }; struct ublk_uring_cmd_pdu { @@ -214,6 +211,33 @@ static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq) #ifdef CONFIG_BLK_DEV_ZONED +struct ublk_zoned_report_desc { + __u64 sector; + __u32 operation; + __u32 nr_zones; +}; + +static DEFINE_XARRAY(ublk_zoned_report_descs); + +static int ublk_zoned_insert_report_desc(const struct request *req, + struct ublk_zoned_report_desc *desc) +{ + return xa_insert(&ublk_zoned_report_descs, (unsigned long)req, + desc, GFP_KERNEL); +} + +static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc( + const struct request *req) +{ + return xa_erase(&ublk_zoned_report_descs, (unsigned long)req); +} + +static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc( + const struct request *req) +{ + return xa_load(&ublk_zoned_report_descs, (unsigned long)req); +} + static int ublk_get_nr_zones(const struct ublk_device *ub) { const struct ublk_param_basic *p = &ub->params.basic; @@ -308,7 +332,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector, unsigned int zones_in_request = min_t(unsigned int, remaining_zones, max_zones_per_request); struct request *req; - struct ublk_rq_data *pdu; + struct ublk_zoned_report_desc desc; blk_status_t status; memset(buffer, 0, buffer_length); @@ -319,20 +343,23 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector, goto out; } - pdu = blk_mq_rq_to_pdu(req); - pdu->operation = UBLK_IO_OP_REPORT_ZONES; - pdu->sector = sector; - pdu->nr_zones = zones_in_request; + desc.operation = UBLK_IO_OP_REPORT_ZONES; + desc.sector = sector; + desc.nr_zones = zones_in_request; + ret = ublk_zoned_insert_report_desc(req, &desc); + if (ret) + goto free_req; ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length, GFP_KERNEL); - if (ret) { - blk_mq_free_request(req); - goto out; - } + if (ret) + goto erase_desc; status = blk_execute_rq(req, 0); ret = blk_status_to_errno(status); +erase_desc: + ublk_zoned_erase_report_desc(req); +free_req: blk_mq_free_request(req); if (ret) goto out; @@ -366,7 +393,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, { struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); struct ublk_io *io = &ubq->ios[req->tag]; - struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req); + struct ublk_zoned_report_desc *desc; u32 ublk_op; switch (req_op(req)) { @@ -389,12 +416,15 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, ublk_op = UBLK_IO_OP_ZONE_RESET_ALL; break; case REQ_OP_DRV_IN: - ublk_op = pdu->operation; + desc = ublk_zoned_get_report_desc(req); + if (!desc) + return BLK_STS_IOERR; + ublk_op = desc->operation; switch (ublk_op) { case UBLK_IO_OP_REPORT_ZONES: iod->op_flags = ublk_op | ublk_req_build_flags(req); - iod->nr_zones = pdu->nr_zones; - iod->start_sector = pdu->sector; + iod->nr_zones = desc->nr_zones; + iod->start_sector = desc->sector; return BLK_STS_OK; default: return BLK_STS_IOERR; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index efcb8d9d274c..84cd62efac0f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -59,17 +59,17 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index, static int zram_slot_trylock(struct zram *zram, u32 index) { - return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); + return spin_trylock(&zram->table[index].lock); } static void zram_slot_lock(struct zram *zram, u32 index) { - bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); + spin_lock(&zram->table[index].lock); } static void zram_slot_unlock(struct zram *zram, u32 index) { - bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); + spin_unlock(&zram->table[index].lock); } static inline bool init_done(struct zram *zram) @@ -1211,7 +1211,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize) static bool zram_meta_alloc(struct zram *zram, u64 disksize) { - size_t num_pages; + size_t num_pages, index; num_pages = disksize >> PAGE_SHIFT; zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table))); @@ -1226,6 +1226,9 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); + + for (index = 0; index < num_pages; index++) + spin_lock_init(&zram->table[index].lock); return true; } @@ -1283,7 +1286,7 @@ out: zram_set_handle(zram, index, 0); zram_set_obj_size(zram, index, 0); WARN_ON_ONCE(zram->table[index].flags & - ~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB)); + ~(1UL << ZRAM_UNDER_WB)); } /* @@ -2401,9 +2404,10 @@ static void destroy_devices(void) static int __init zram_init(void) { + struct zram_table_entry zram_te; int ret; - BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG); + BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > sizeof(zram_te.flags) * 8); ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare", zcomp_cpu_up_prepare, zcomp_cpu_dead); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 35e322144629..531cefc66668 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -45,9 +45,7 @@ /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { - /* zram slot is locked */ - ZRAM_LOCK = ZRAM_FLAG_SHIFT, - ZRAM_SAME, /* Page consists the same element */ + ZRAM_SAME = ZRAM_FLAG_SHIFT, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ ZRAM_UNDER_WB, /* page is under writeback */ ZRAM_HUGE, /* Incompressible page */ @@ -68,7 +66,8 @@ struct zram_table_entry { unsigned long handle; unsigned long element; }; - unsigned long flags; + unsigned int flags; + spinlock_t lock; #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME ktime_t ac_time; #endif diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 0c3323e0adb2..63682d27fc8d 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3949,7 +3949,9 @@ static int __load_dirty_region_bitmap(struct raid_set *rs) /* Try loading the bitmap unless "raid0", which does not have one */ if (!rs_is_raid0(rs) && !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) { - r = md_bitmap_load(&rs->md); + struct mddev *mddev = &rs->md; + + r = mddev->bitmap_ops->load(mddev); if (r) DMERR("Failed to load bitmap"); } @@ -4066,7 +4068,8 @@ static int raid_preresume(struct dm_target *ti) mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; - r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0); + r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, + chunksize, false); if (r) DMERR("Failed to resize bitmap"); } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index db5330d97348..29da10e6f703 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -32,11 +32,210 @@ #include "md.h" #include "md-bitmap.h" +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + * Version 5 is currently set only for clustered devices + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_CLUSTERED 5 +#define BITMAP_MAJOR_HOSTENDIAN 3 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared as well, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stripe is clean + * Anything that dirties the stripe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? */ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SHIFT 9 + +/* + * bitmap structures: + */ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * If any counter in this page is '1' or '2' - and so could be + * cleared then that page is marked as 'pending' + */ + unsigned int pending:1; + /* + * count of dirty bits on the page + */ + unsigned int count:30; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + + struct bitmap_counts { + spinlock_t lock; + struct bitmap_page *bp; + /* total number of pages in the bitmap */ + unsigned long pages; + /* number of pages not yet allocated */ + unsigned long missing_pages; + /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunkshift; + /* total number of data chunks for the array */ + unsigned long chunks; + } counts; + + struct mddev *mddev; /* the md device that the bitmap is for */ + + __u64 events_cleared; + int need_sync; + + struct bitmap_storage { + /* backing disk file */ + struct file *file; + /* cached copy of the bitmap file superblock */ + struct page *sb_page; + unsigned long sb_index; + /* list of cache pages for the file */ + struct page **filemap; + /* attributes associated filemap pages */ + unsigned long *filemap_attr; + /* number of pages in the file */ + unsigned long file_pages; + /* total bytes in the bitmap */ + unsigned long bytes; + } storage; + + unsigned long flags; + + int allclean; + + atomic_t behind_writes; + /* highest actual value at runtime */ + unsigned long behind_writes_used; + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + /* + * when we lasted called end_sync to update bitmap with resync + * progress. + */ + unsigned long last_end_sync; + + /* pending writes to the bitmap file */ + atomic_t pending_writes; + wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; + wait_queue_head_t behind_wait; + + struct kernfs_node *sysfs_can_clear; + /* slot offset for clustered env */ + int cluster_slot; +}; + +static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, bool init); + static inline char *bmname(struct bitmap *bitmap) { return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; } +static bool __bitmap_enabled(struct bitmap *bitmap) +{ + return bitmap->storage.filemap && + !test_bit(BITMAP_STALE, &bitmap->flags); +} + +static bool bitmap_enabled(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + return false; + + return __bitmap_enabled(bitmap); +} + /* * check a page and, if necessary, allocate it (or hijack it if the alloc fails) * @@ -472,9 +671,10 @@ static void md_bitmap_wait_writes(struct bitmap *bitmap) /* update the event counter and sync the superblock to disk */ -void md_bitmap_update_sb(struct bitmap *bitmap) +static void bitmap_update_sb(void *data) { bitmap_super_t *sb; + struct bitmap *bitmap = data; if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ return; @@ -510,10 +710,8 @@ void md_bitmap_update_sb(struct bitmap *bitmap) write_sb_page(bitmap, bitmap->storage.sb_index, bitmap->storage.sb_page, 1); } -EXPORT_SYMBOL(md_bitmap_update_sb); -/* print out the bitmap file superblock */ -void md_bitmap_print_sb(struct bitmap *bitmap) +static void bitmap_print_sb(struct bitmap *bitmap) { bitmap_super_t *sb; @@ -760,7 +958,7 @@ out_no_sb: bitmap->mddev->bitmap_info.space > sectors_reserved) bitmap->mddev->bitmap_info.space = sectors_reserved; } else { - md_bitmap_print_sb(bitmap); + bitmap_print_sb(bitmap); if (bitmap->cluster_slot < 0) md_cluster_stop(bitmap->mddev); } @@ -893,7 +1091,7 @@ static void md_bitmap_file_unmap(struct bitmap_storage *store) static void md_bitmap_file_kick(struct bitmap *bitmap) { if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { - md_bitmap_update_sb(bitmap); + bitmap_update_sb(bitmap); if (bitmap->storage.file) { pr_warn("%s: kicking failed bitmap file %pD4 from array!\n", @@ -1028,13 +1226,13 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) /* this gets called when the md device is ready to unplug its underlying * (slave) device queues -- before we let any writes go down, we need to * sync the dirty pages of the bitmap file to disk */ -void md_bitmap_unplug(struct bitmap *bitmap) +static void __bitmap_unplug(struct bitmap *bitmap) { unsigned long i; int dirty, need_write; int writing = 0; - if (!md_bitmap_enabled(bitmap)) + if (!__bitmap_enabled(bitmap)) return; /* look at each page to see if there are any set bits that need to be @@ -1060,7 +1258,6 @@ void md_bitmap_unplug(struct bitmap *bitmap) if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) md_bitmap_file_kick(bitmap); } -EXPORT_SYMBOL(md_bitmap_unplug); struct bitmap_unplug_work { struct work_struct work; @@ -1073,11 +1270,11 @@ static void md_bitmap_unplug_fn(struct work_struct *work) struct bitmap_unplug_work *unplug_work = container_of(work, struct bitmap_unplug_work, work); - md_bitmap_unplug(unplug_work->bitmap); + __bitmap_unplug(unplug_work->bitmap); complete(unplug_work->done); } -void md_bitmap_unplug_async(struct bitmap *bitmap) +static void bitmap_unplug_async(struct bitmap *bitmap) { DECLARE_COMPLETION_ONSTACK(done); struct bitmap_unplug_work unplug_work; @@ -1089,7 +1286,19 @@ void md_bitmap_unplug_async(struct bitmap *bitmap) queue_work(md_bitmap_wq, &unplug_work.work); wait_for_completion(&done); } -EXPORT_SYMBOL(md_bitmap_unplug_async); + +static void bitmap_unplug(struct mddev *mddev, bool sync) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + return; + + if (sync) + __bitmap_unplug(bitmap); + else + bitmap_unplug_async(bitmap); +} static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); @@ -1226,22 +1435,21 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) return ret; } -void md_bitmap_write_all(struct bitmap *bitmap) +/* just flag bitmap pages as needing to be written. */ +static void bitmap_write_all(struct mddev *mddev) { - /* We don't actually write all bitmap blocks here, - * just flag them as needing to be written - */ int i; + struct bitmap *bitmap = mddev->bitmap; if (!bitmap || !bitmap->storage.filemap) return; + + /* Only one copy, so nothing needed */ if (bitmap->storage.file) - /* Only one copy, so nothing needed */ return; for (i = 0; i < bitmap->storage.file_pages; i++) - set_page_attr(bitmap, i, - BITMAP_PAGE_NEEDWRITE); + set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); bitmap->allclean = 0; } @@ -1290,7 +1498,7 @@ out: * bitmap daemon -- periodically wakes up to clean bits and flush pages * out to disk */ -void md_bitmap_daemon_work(struct mddev *mddev) +static void bitmap_daemon_work(struct mddev *mddev) { struct bitmap *bitmap; unsigned long j; @@ -1461,8 +1669,11 @@ __acquires(bitmap->lock) &(bitmap->bp[page].map[pageoff]); } -int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) +static int bitmap_startwrite(struct mddev *mddev, sector_t offset, + unsigned long sectors, bool behind) { + struct bitmap *bitmap = mddev->bitmap; + if (!bitmap) return 0; @@ -1523,13 +1734,15 @@ int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long s } return 0; } -EXPORT_SYMBOL(md_bitmap_startwrite); -void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int success, int behind) +static void bitmap_endwrite(struct mddev *mddev, sector_t offset, + unsigned long sectors, bool success, bool behind) { + struct bitmap *bitmap = mddev->bitmap; + if (!bitmap) return; + if (behind) { if (atomic_dec_and_test(&bitmap->behind_writes)) wake_up(&bitmap->behind_wait); @@ -1576,26 +1789,27 @@ void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, sectors = 0; } } -EXPORT_SYMBOL(md_bitmap_endwrite); -static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, - int degraded) +static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, + sector_t *blocks, bool degraded) { bitmap_counter_t *bmc; - int rv; + bool rv; + if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ *blocks = 1024; - return 1; /* always resync if no bitmap */ + return true; /* always resync if no bitmap */ } spin_lock_irq(&bitmap->counts.lock); + + rv = false; bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); - rv = 0; if (bmc) { /* locked */ - if (RESYNC(*bmc)) - rv = 1; - else if (NEEDED(*bmc)) { - rv = 1; + if (RESYNC(*bmc)) { + rv = true; + } else if (NEEDED(*bmc)) { + rv = true; if (!degraded) { /* don't set/clear bits if degraded */ *bmc |= RESYNC_MASK; *bmc &= ~NEEDED_MASK; @@ -1603,11 +1817,12 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t } } spin_unlock_irq(&bitmap->counts.lock); + return rv; } -int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, - int degraded) +static bool bitmap_start_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded) { /* bitmap_start_sync must always report on multiples of whole * pages, otherwise resync (which is very PAGE_SIZE based) will @@ -1616,21 +1831,22 @@ int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *block * At least PAGE_SIZE>>9 blocks are covered. * Return the 'or' of the result. */ - int rv = 0; + bool rv = false; sector_t blocks1; *blocks = 0; while (*blocks < (PAGE_SIZE>>9)) { - rv |= __bitmap_start_sync(bitmap, offset, + rv |= __bitmap_start_sync(mddev->bitmap, offset, &blocks1, degraded); offset += blocks1; *blocks += blocks1; } + return rv; } -EXPORT_SYMBOL(md_bitmap_start_sync); -void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) +static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset, + sector_t *blocks, bool aborted) { bitmap_counter_t *bmc; unsigned long flags; @@ -1659,9 +1875,14 @@ void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks unlock: spin_unlock_irqrestore(&bitmap->counts.lock, flags); } -EXPORT_SYMBOL(md_bitmap_end_sync); -void md_bitmap_close_sync(struct bitmap *bitmap) +static void bitmap_end_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks) +{ + __bitmap_end_sync(mddev->bitmap, offset, blocks, true); +} + +static void bitmap_close_sync(struct mddev *mddev) { /* Sync has finished, and any bitmap chunks that weren't synced * properly have been aborted. It remains to us to clear the @@ -1669,19 +1890,23 @@ void md_bitmap_close_sync(struct bitmap *bitmap) */ sector_t sector = 0; sector_t blocks; + struct bitmap *bitmap = mddev->bitmap; + if (!bitmap) return; + while (sector < bitmap->mddev->resync_max_sectors) { - md_bitmap_end_sync(bitmap, sector, &blocks, 0); + __bitmap_end_sync(bitmap, sector, &blocks, false); sector += blocks; } } -EXPORT_SYMBOL(md_bitmap_close_sync); -void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) +static void bitmap_cond_end_sync(struct mddev *mddev, sector_t sector, + bool force) { sector_t s = 0; sector_t blocks; + struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return; @@ -1700,34 +1925,32 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { - md_bitmap_end_sync(bitmap, s, &blocks, 0); + __bitmap_end_sync(bitmap, s, &blocks, false); s += blocks; } bitmap->last_end_sync = jiffies; sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); } -EXPORT_SYMBOL(md_bitmap_cond_end_sync); -void md_bitmap_sync_with_cluster(struct mddev *mddev, - sector_t old_lo, sector_t old_hi, - sector_t new_lo, sector_t new_hi) +static void bitmap_sync_with_cluster(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi) { struct bitmap *bitmap = mddev->bitmap; sector_t sector, blocks = 0; for (sector = old_lo; sector < new_lo; ) { - md_bitmap_end_sync(bitmap, sector, &blocks, 0); + __bitmap_end_sync(bitmap, sector, &blocks, false); sector += blocks; } WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n"); for (sector = old_hi; sector < new_hi; ) { - md_bitmap_start_sync(bitmap, sector, &blocks, 0); + bitmap_start_sync(mddev, sector, &blocks, false); sector += blocks; } WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n"); } -EXPORT_SYMBOL(md_bitmap_sync_with_cluster); static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) { @@ -1756,12 +1979,18 @@ static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, in } /* dirty the memory and file bits for bitmap chunks "s" to "e" */ -void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) +static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s, + unsigned long e) { unsigned long chunk; + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + return; for (chunk = s; chunk <= e; chunk++) { sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; + md_bitmap_set_memory_bits(bitmap, sec, 1); md_bitmap_file_set_bit(bitmap, sec); if (sec < bitmap->mddev->recovery_cp) @@ -1773,10 +2002,7 @@ void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long } } -/* - * flush out any pending updates - */ -void md_bitmap_flush(struct mddev *mddev) +static void bitmap_flush(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; long sleep; @@ -1789,23 +2015,21 @@ void md_bitmap_flush(struct mddev *mddev) */ sleep = mddev->bitmap_info.daemon_sleep * 2; bitmap->daemon_lastrun -= sleep; - md_bitmap_daemon_work(mddev); + bitmap_daemon_work(mddev); bitmap->daemon_lastrun -= sleep; - md_bitmap_daemon_work(mddev); + bitmap_daemon_work(mddev); bitmap->daemon_lastrun -= sleep; - md_bitmap_daemon_work(mddev); + bitmap_daemon_work(mddev); if (mddev->bitmap_info.external) md_super_wait(mddev); - md_bitmap_update_sb(bitmap); + bitmap_update_sb(bitmap); } -/* - * free memory that was allocated - */ -void md_bitmap_free(struct bitmap *bitmap) +static void md_bitmap_free(void *data) { unsigned long k, pages; struct bitmap_page *bp; + struct bitmap *bitmap = data; if (!bitmap) /* there was no bitmap */ return; @@ -1836,9 +2060,8 @@ void md_bitmap_free(struct bitmap *bitmap) kfree(bp); kfree(bitmap); } -EXPORT_SYMBOL(md_bitmap_free); -void md_bitmap_wait_behind_writes(struct mddev *mddev) +static void bitmap_wait_behind_writes(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; @@ -1852,14 +2075,14 @@ void md_bitmap_wait_behind_writes(struct mddev *mddev) } } -void md_bitmap_destroy(struct mddev *mddev) +static void bitmap_destroy(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) /* there was no bitmap */ return; - md_bitmap_wait_behind_writes(mddev); + bitmap_wait_behind_writes(mddev); if (!mddev->serialize_policy) mddev_destroy_serial_pool(mddev, NULL); @@ -1878,7 +2101,7 @@ void md_bitmap_destroy(struct mddev *mddev) * if this returns an error, bitmap_destroy must be called to do clean up * once mddev->bitmap is set */ -struct bitmap *md_bitmap_create(struct mddev *mddev, int slot) +static struct bitmap *__bitmap_create(struct mddev *mddev, int slot) { struct bitmap *bitmap; sector_t blocks = mddev->resync_max_sectors; @@ -1948,7 +2171,8 @@ struct bitmap *md_bitmap_create(struct mddev *mddev, int slot) goto error; bitmap->daemon_lastrun = jiffies; - err = md_bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); + err = __bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, + true); if (err) goto error; @@ -1965,7 +2189,18 @@ struct bitmap *md_bitmap_create(struct mddev *mddev, int slot) return ERR_PTR(err); } -int md_bitmap_load(struct mddev *mddev) +static int bitmap_create(struct mddev *mddev, int slot) +{ + struct bitmap *bitmap = __bitmap_create(mddev, slot); + + if (IS_ERR(bitmap)) + return PTR_ERR(bitmap); + + mddev->bitmap = bitmap; + return 0; +} + +static int bitmap_load(struct mddev *mddev) { int err = 0; sector_t start = 0; @@ -1989,10 +2224,10 @@ int md_bitmap_load(struct mddev *mddev) */ while (sector < mddev->resync_max_sectors) { sector_t blocks; - md_bitmap_start_sync(bitmap, sector, &blocks, 0); + bitmap_start_sync(mddev, sector, &blocks, false); sector += blocks; } - md_bitmap_close_sync(bitmap); + bitmap_close_sync(mddev); if (mddev->degraded == 0 || bitmap->events_cleared == mddev->events) @@ -2014,22 +2249,21 @@ int md_bitmap_load(struct mddev *mddev) mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); md_wakeup_thread(mddev->thread); - md_bitmap_update_sb(bitmap); + bitmap_update_sb(bitmap); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) err = -EIO; out: return err; } -EXPORT_SYMBOL_GPL(md_bitmap_load); /* caller need to free returned bitmap with md_bitmap_free() */ -struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) +static void *bitmap_get_from_slot(struct mddev *mddev, int slot) { int rv = 0; struct bitmap *bitmap; - bitmap = md_bitmap_create(mddev, slot); + bitmap = __bitmap_create(mddev, slot); if (IS_ERR(bitmap)) { rv = PTR_ERR(bitmap); return ERR_PTR(rv); @@ -2043,20 +2277,19 @@ struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) return bitmap; } -EXPORT_SYMBOL(get_bitmap_from_slot); /* Loads the bitmap associated with slot and copies the resync information * to our bitmap */ -int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, - sector_t *low, sector_t *high, bool clear_bits) +static int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low, + sector_t *high, bool clear_bits) { int rv = 0, i, j; sector_t block, lo = 0, hi = 0; struct bitmap_counts *counts; struct bitmap *bitmap; - bitmap = get_bitmap_from_slot(mddev, slot); + bitmap = bitmap_get_from_slot(mddev, slot); if (IS_ERR(bitmap)) { pr_err("%s can't get bitmap from slot %d\n", __func__, slot); return -1; @@ -2076,53 +2309,59 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, } if (clear_bits) { - md_bitmap_update_sb(bitmap); + bitmap_update_sb(bitmap); /* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */ for (i = 0; i < bitmap->storage.file_pages; i++) if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING)) set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); - md_bitmap_unplug(bitmap); + __bitmap_unplug(bitmap); } - md_bitmap_unplug(mddev->bitmap); + __bitmap_unplug(mddev->bitmap); *low = lo; *high = hi; md_bitmap_free(bitmap); return rv; } -EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot); +static void bitmap_set_pages(void *data, unsigned long pages) +{ + struct bitmap *bitmap = data; + + bitmap->counts.pages = pages; +} -void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap) +static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats) { - unsigned long chunk_kb; + struct bitmap_storage *storage; struct bitmap_counts *counts; + struct bitmap *bitmap = data; + bitmap_super_t *sb; if (!bitmap) - return; + return -ENOENT; + + sb = kmap_local_page(bitmap->storage.sb_page); + stats->sync_size = le64_to_cpu(sb->sync_size); + kunmap_local(sb); counts = &bitmap->counts; + stats->missing_pages = counts->missing_pages; + stats->pages = counts->pages; - chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; - seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " - "%lu%s chunk", - counts->pages - counts->missing_pages, - counts->pages, - (counts->pages - counts->missing_pages) - << (PAGE_SHIFT - 10), - chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, - chunk_kb ? "KB" : "B"); - if (bitmap->storage.file) { - seq_printf(seq, ", file: "); - seq_file_path(seq, bitmap->storage.file, " \t\n"); - } + storage = &bitmap->storage; + stats->file_pages = storage->file_pages; + stats->file = storage->file; - seq_printf(seq, "\n"); + stats->behind_writes = atomic_read(&bitmap->behind_writes); + stats->behind_wait = wq_has_sleeper(&bitmap->behind_wait); + stats->events_cleared = bitmap->events_cleared; + return 0; } -int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, - int chunksize, int init) +static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, bool init) { /* If chunk_size is 0, choose an appropriate chunk size. * Then possibly allocate new storage space. @@ -2320,14 +2559,24 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, spin_unlock_irq(&bitmap->counts.lock); if (!init) { - md_bitmap_unplug(bitmap); + __bitmap_unplug(bitmap); bitmap->mddev->pers->quiesce(bitmap->mddev, 0); } ret = 0; err: return ret; } -EXPORT_SYMBOL_GPL(md_bitmap_resize); + +static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize, + bool init) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + return 0; + + return __bitmap_resize(bitmap, blocks, chunksize, init); +} static ssize_t location_show(struct mddev *mddev, char *page) @@ -2367,7 +2616,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) goto out; } - md_bitmap_destroy(mddev); + bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; if (mddev->bitmap_info.file) { struct file *f = mddev->bitmap_info.file; @@ -2377,7 +2626,6 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } else { /* No bitmap, OK to set a location */ long long offset; - struct bitmap *bitmap; if (strncmp(buf, "none", 4) == 0) /* nothing to be done */; @@ -2404,17 +2652,14 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } mddev->bitmap_info.offset = offset; - bitmap = md_bitmap_create(mddev, -1); - if (IS_ERR(bitmap)) { - rv = PTR_ERR(bitmap); + rv = bitmap_create(mddev, -1); + if (rv) goto out; - } - mddev->bitmap = bitmap; - rv = md_bitmap_load(mddev); + rv = bitmap_load(mddev); if (rv) { mddev->bitmap_info.offset = 0; - md_bitmap_destroy(mddev); + bitmap_destroy(mddev); goto out; } } @@ -2450,6 +2695,7 @@ space_show(struct mddev *mddev, char *page) static ssize_t space_store(struct mddev *mddev, const char *buf, size_t len) { + struct bitmap *bitmap; unsigned long sectors; int rv; @@ -2460,8 +2706,8 @@ space_store(struct mddev *mddev, const char *buf, size_t len) if (sectors == 0) return -EINVAL; - if (mddev->bitmap && - sectors < (mddev->bitmap->storage.bytes + 511) >> 9) + bitmap = mddev->bitmap; + if (bitmap && sectors < (bitmap->storage.bytes + 511) >> 9) return -EFBIG; /* Bitmap is too big for this small space */ /* could make sure it isn't too big, but that isn't really @@ -2569,7 +2815,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev_create_serial_pool(mddev, rdev); } if (old_mwb != backlog) - md_bitmap_update_sb(mddev->bitmap); + bitmap_update_sb(mddev->bitmap); mddev_unlock_and_resume(mddev); return len; @@ -2638,10 +2884,13 @@ __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t can_clear_show(struct mddev *mddev, char *page) { int len; + struct bitmap *bitmap; + spin_lock(&mddev->lock); - if (mddev->bitmap) - len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? - "false" : "true")); + bitmap = mddev->bitmap; + if (bitmap) + len = sprintf(page, "%s\n", (bitmap->need_sync ? "false" : + "true")); else len = sprintf(page, "\n"); spin_unlock(&mddev->lock); @@ -2650,17 +2899,24 @@ static ssize_t can_clear_show(struct mddev *mddev, char *page) static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len) { - if (mddev->bitmap == NULL) + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) return -ENOENT; - if (strncmp(buf, "false", 5) == 0) - mddev->bitmap->need_sync = 1; - else if (strncmp(buf, "true", 4) == 0) { + + if (strncmp(buf, "false", 5) == 0) { + bitmap->need_sync = 1; + return len; + } + + if (strncmp(buf, "true", 4) == 0) { if (mddev->degraded) return -EBUSY; - mddev->bitmap->need_sync = 0; - } else - return -EINVAL; - return len; + bitmap->need_sync = 0; + return len; + } + + return -EINVAL; } static struct md_sysfs_entry bitmap_can_clear = @@ -2670,21 +2926,26 @@ static ssize_t behind_writes_used_show(struct mddev *mddev, char *page) { ssize_t ret; + struct bitmap *bitmap; + spin_lock(&mddev->lock); - if (mddev->bitmap == NULL) + bitmap = mddev->bitmap; + if (!bitmap) ret = sprintf(page, "0\n"); else - ret = sprintf(page, "%lu\n", - mddev->bitmap->behind_writes_used); + ret = sprintf(page, "%lu\n", bitmap->behind_writes_used); spin_unlock(&mddev->lock); + return ret; } static ssize_t behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len) { - if (mddev->bitmap) - mddev->bitmap->behind_writes_used = 0; + struct bitmap *bitmap = mddev->bitmap; + + if (bitmap) + bitmap->behind_writes_used = 0; return len; } @@ -2707,3 +2968,38 @@ const struct attribute_group md_bitmap_group = { .name = "bitmap", .attrs = md_bitmap_attrs, }; + +static struct bitmap_operations bitmap_ops = { + .enabled = bitmap_enabled, + .create = bitmap_create, + .resize = bitmap_resize, + .load = bitmap_load, + .destroy = bitmap_destroy, + .flush = bitmap_flush, + .write_all = bitmap_write_all, + .dirty_bits = bitmap_dirty_bits, + .unplug = bitmap_unplug, + .daemon_work = bitmap_daemon_work, + .wait_behind_writes = bitmap_wait_behind_writes, + + .startwrite = bitmap_startwrite, + .endwrite = bitmap_endwrite, + .start_sync = bitmap_start_sync, + .end_sync = bitmap_end_sync, + .cond_end_sync = bitmap_cond_end_sync, + .close_sync = bitmap_close_sync, + + .update_sb = bitmap_update_sb, + .get_stats = bitmap_get_stats, + + .sync_with_cluster = bitmap_sync_with_cluster, + .get_from_slot = bitmap_get_from_slot, + .copy_from_slot = bitmap_copy_from_slot, + .set_pages = bitmap_set_pages, + .free = md_bitmap_free, +}; + +void mddev_set_bitmap_ops(struct mddev *mddev) +{ + mddev->bitmap_ops = &bitmap_ops; +} diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index bb9eb418780a..662e6fc141a7 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -7,81 +7,7 @@ #ifndef BITMAP_H #define BITMAP_H 1 -#define BITMAP_MAJOR_LO 3 -/* version 4 insists the bitmap is in little-endian order - * with version 3, it is host-endian which is non-portable - * Version 5 is currently set only for clustered devices - */ -#define BITMAP_MAJOR_HI 4 -#define BITMAP_MAJOR_CLUSTERED 5 -#define BITMAP_MAJOR_HOSTENDIAN 3 - -/* - * in-memory bitmap: - * - * Use 16 bit block counters to track pending writes to each "chunk". - * The 2 high order bits are special-purpose, the first is a flag indicating - * whether a resync is needed. The second is a flag indicating whether a - * resync is active. - * This means that the counter is actually 14 bits: - * - * +--------+--------+------------------------------------------------+ - * | resync | resync | counter | - * | needed | active | | - * | (0-1) | (0-1) | (0-16383) | - * +--------+--------+------------------------------------------------+ - * - * The "resync needed" bit is set when: - * a '1' bit is read from storage at startup. - * a write request fails on some drives - * a resync is aborted on a chunk with 'resync active' set - * It is cleared (and resync-active set) when a resync starts across all drives - * of the chunk. - * - * - * The "resync active" bit is set when: - * a resync is started on all drives, and resync_needed is set. - * resync_needed will be cleared (as long as resync_active wasn't already set). - * It is cleared when a resync completes. - * - * The counter counts pending write requests, plus the on-disk bit. - * When the counter is '1' and the resync bits are clear, the on-disk - * bit can be cleared as well, thus setting the counter to 0. - * When we set a bit, or in the counter (to start a write), if the fields is - * 0, we first set the disk bit and set the counter to 1. - * - * If the counter is 0, the on-disk bit is clear and the stripe is clean - * Anything that dirties the stripe pushes the counter to 2 (at least) - * and sets the on-disk bit (lazily). - * If a periodic sweep find the counter at 2, it is decremented to 1. - * If the sweep find the counter at 1, the on-disk bit is cleared and the - * counter goes to zero. - * - * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block - * counters as a fallback when "page" memory cannot be allocated: - * - * Normal case (page memory allocated): - * - * page pointer (32-bit) - * - * [ ] ------+ - * | - * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) - * c1 c2 c2048 - * - * Hijacked case (page memory allocation failed): - * - * hijacked page pointer (32-bit) - * - * [ ][ ] (no page memory allocated) - * counter #1 (16-bit) counter #2 (16-bit) - * - */ - -#ifdef __KERNEL__ - -#define PAGE_BITS (PAGE_SIZE << 3) -#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) +#define BITMAP_MAGIC 0x6d746962 typedef __u16 bitmap_counter_t; #define COUNTER_BITS 16 @@ -91,26 +17,6 @@ typedef __u16 bitmap_counter_t; #define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) #define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) #define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) -#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) -#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) -#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) - -/* how many counters per page? */ -#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) -/* same, except a shift value for more efficient bitops */ -#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) -/* same, except a mask value for more efficient bitops */ -#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) - -#define BITMAP_BLOCK_SHIFT 9 - -#endif - -/* - * bitmap structures: - */ - -#define BITMAP_MAGIC 0x6d746962 /* use these for bitmap->flags and bitmap->sb->state bit-fields */ enum bitmap_state { @@ -152,136 +58,58 @@ typedef struct bitmap_super_s { * devices. For raid10 it is the size of the array. */ -#ifdef __KERNEL__ +struct md_bitmap_stats { + u64 events_cleared; + int behind_writes; + bool behind_wait; -/* the in-memory bitmap is represented by bitmap_pages */ -struct bitmap_page { - /* - * map points to the actual memory page - */ - char *map; - /* - * in emergencies (when map cannot be alloced), hijack the map - * pointer and use it as two counters itself - */ - unsigned int hijacked:1; - /* - * If any counter in this page is '1' or '2' - and so could be - * cleared then that page is marked as 'pending' - */ - unsigned int pending:1; - /* - * count of dirty bits on the page - */ - unsigned int count:30; + unsigned long missing_pages; + unsigned long file_pages; + unsigned long sync_size; + unsigned long pages; + struct file *file; }; -/* the main bitmap structure - one per mddev */ -struct bitmap { - - struct bitmap_counts { - spinlock_t lock; - struct bitmap_page *bp; - unsigned long pages; /* total number of pages - * in the bitmap */ - unsigned long missing_pages; /* number of pages - * not yet allocated */ - unsigned long chunkshift; /* chunksize = 2^chunkshift - * (for bitops) */ - unsigned long chunks; /* Total number of data - * chunks for the array */ - } counts; - - struct mddev *mddev; /* the md device that the bitmap is for */ - - __u64 events_cleared; - int need_sync; - - struct bitmap_storage { - struct file *file; /* backing disk file */ - struct page *sb_page; /* cached copy of the bitmap - * file superblock */ - unsigned long sb_index; - struct page **filemap; /* list of cache pages for - * the file */ - unsigned long *filemap_attr; /* attributes associated - * w/ filemap pages */ - unsigned long file_pages; /* number of pages in the file*/ - unsigned long bytes; /* total bytes in the bitmap */ - } storage; - - unsigned long flags; - - int allclean; - - atomic_t behind_writes; - unsigned long behind_writes_used; /* highest actual value at runtime */ - - /* - * the bitmap daemon - periodically wakes up and sweeps the bitmap - * file, cleaning up bits and flushing out pages to disk as necessary - */ - unsigned long daemon_lastrun; /* jiffies of last run */ - unsigned long last_end_sync; /* when we lasted called end_sync to - * update bitmap with resync progress */ - - atomic_t pending_writes; /* pending writes to the bitmap file */ - wait_queue_head_t write_wait; - wait_queue_head_t overflow_wait; - wait_queue_head_t behind_wait; - - struct kernfs_node *sysfs_can_clear; - int cluster_slot; /* Slot offset for clustered env */ +struct bitmap_operations { + bool (*enabled)(struct mddev *mddev); + int (*create)(struct mddev *mddev, int slot); + int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize, + bool init); + + int (*load)(struct mddev *mddev); + void (*destroy)(struct mddev *mddev); + void (*flush)(struct mddev *mddev); + void (*write_all)(struct mddev *mddev); + void (*dirty_bits)(struct mddev *mddev, unsigned long s, + unsigned long e); + void (*unplug)(struct mddev *mddev, bool sync); + void (*daemon_work)(struct mddev *mddev); + void (*wait_behind_writes)(struct mddev *mddev); + + int (*startwrite)(struct mddev *mddev, sector_t offset, + unsigned long sectors, bool behind); + void (*endwrite)(struct mddev *mddev, sector_t offset, + unsigned long sectors, bool success, bool behind); + bool (*start_sync)(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded); + void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); + void (*cond_end_sync)(struct mddev *mddev, sector_t sector, bool force); + void (*close_sync)(struct mddev *mddev); + + void (*update_sb)(void *data); + int (*get_stats)(void *data, struct md_bitmap_stats *stats); + + void (*sync_with_cluster)(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi); + void *(*get_from_slot)(struct mddev *mddev, int slot); + int (*copy_from_slot)(struct mddev *mddev, int slot, sector_t *lo, + sector_t *hi, bool clear_bits); + void (*set_pages)(void *data, unsigned long pages); + void (*free)(void *data); }; /* the bitmap API */ - -/* these are used only by md/bitmap */ -struct bitmap *md_bitmap_create(struct mddev *mddev, int slot); -int md_bitmap_load(struct mddev *mddev); -void md_bitmap_flush(struct mddev *mddev); -void md_bitmap_destroy(struct mddev *mddev); - -void md_bitmap_print_sb(struct bitmap *bitmap); -void md_bitmap_update_sb(struct bitmap *bitmap); -void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap); - -int md_bitmap_setallbits(struct bitmap *bitmap); -void md_bitmap_write_all(struct bitmap *bitmap); - -void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); - -/* these are exported */ -int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int behind); -void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int success, int behind); -int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); -void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); -void md_bitmap_close_sync(struct bitmap *bitmap); -void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); -void md_bitmap_sync_with_cluster(struct mddev *mddev, - sector_t old_lo, sector_t old_hi, - sector_t new_lo, sector_t new_hi); - -void md_bitmap_unplug(struct bitmap *bitmap); -void md_bitmap_unplug_async(struct bitmap *bitmap); -void md_bitmap_daemon_work(struct mddev *mddev); - -int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, - int chunksize, int init); -struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot); -int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, - sector_t *lo, sector_t *hi, bool clear_bits); -void md_bitmap_free(struct bitmap *bitmap); -void md_bitmap_wait_behind_writes(struct mddev *mddev); - -static inline bool md_bitmap_enabled(struct bitmap *bitmap) -{ - return bitmap && bitmap->storage.filemap && - !test_bit(BITMAP_STALE, &bitmap->flags); -} - -#endif +void mddev_set_bitmap_ops(struct mddev *mddev); #endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 1d0db62f0351..6595f89becdb 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -317,7 +317,7 @@ static void recover_bitmaps(struct md_thread *thread) str, ret); goto clear_bit; } - ret = md_bitmap_copy_from_slot(mddev, slot, &lo, &hi, true); + ret = mddev->bitmap_ops->copy_from_slot(mddev, slot, &lo, &hi, true); if (ret) { pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); goto clear_bit; @@ -497,8 +497,8 @@ static void process_suspend_info(struct mddev *mddev, * we don't want to trigger lots of WARN. */ if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) - md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, - cinfo->sync_hi, lo, hi); + mddev->bitmap_ops->sync_with_cluster(mddev, cinfo->sync_low, + cinfo->sync_hi, lo, hi); cinfo->sync_low = lo; cinfo->sync_hi = hi; @@ -628,8 +628,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) break; case BITMAP_RESIZE: if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) - ret = md_bitmap_resize(mddev->bitmap, - le64_to_cpu(msg->high), 0, 0); + ret = mddev->bitmap_ops->resize(mddev, + le64_to_cpu(msg->high), + 0, false); break; default: ret = -1; @@ -856,7 +857,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) } /* Read the disk bitmap sb and check if it needs recovery */ - ret = md_bitmap_copy_from_slot(mddev, i, &lo, &hi, false); + ret = mddev->bitmap_ops->copy_from_slot(mddev, i, &lo, &hi, false); if (ret) { pr_warn("md-cluster: Could not gather bitmaps from slot %d", i); lockres_free(bm_lockres); @@ -1143,13 +1144,16 @@ static int update_bitmap_size(struct mddev *mddev, sector_t size) static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize) { - struct bitmap_counts *counts; - char str[64]; - struct dlm_lock_resource *bm_lockres; - struct bitmap *bitmap = mddev->bitmap; - unsigned long my_pages = bitmap->counts.pages; + void *bitmap = mddev->bitmap; + struct md_bitmap_stats stats; + unsigned long my_pages; int i, rv; + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); + if (rv) + return rv; + + my_pages = stats.pages; /* * We need to ensure all the nodes can grow to a larger * bitmap size before make the reshaping. @@ -1159,17 +1163,22 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz return rv; for (i = 0; i < mddev->bitmap_info.nodes; i++) { + struct dlm_lock_resource *bm_lockres; + char str[64]; + if (i == md_cluster_ops->slot_number(mddev)) continue; - bitmap = get_bitmap_from_slot(mddev, i); + bitmap = mddev->bitmap_ops->get_from_slot(mddev, i); if (IS_ERR(bitmap)) { pr_err("can't get bitmap from slot %d\n", i); bitmap = NULL; goto out; } - counts = &bitmap->counts; + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); + if (rv) + goto out; /* * If we can hold the bitmap lock of one node then * the slot is not occupied, update the pages. @@ -1183,21 +1192,21 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz bm_lockres->flags |= DLM_LKF_NOQUEUE; rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (!rv) - counts->pages = my_pages; + mddev->bitmap_ops->set_pages(bitmap, my_pages); lockres_free(bm_lockres); - if (my_pages != counts->pages) + if (my_pages != stats.pages) /* * Let's revert the bitmap size if one node * can't resize bitmap */ goto out; - md_bitmap_free(bitmap); + mddev->bitmap_ops->free(bitmap); } return 0; out: - md_bitmap_free(bitmap); + mddev->bitmap_ops->free(bitmap); update_bitmap_size(mddev, oldsize); return -1; } @@ -1207,24 +1216,27 @@ out: */ static int cluster_check_sync_size(struct mddev *mddev) { - int i, rv; - bitmap_super_t *sb; - unsigned long my_sync_size, sync_size = 0; - int node_num = mddev->bitmap_info.nodes; int current_slot = md_cluster_ops->slot_number(mddev); - struct bitmap *bitmap = mddev->bitmap; - char str[64]; + int node_num = mddev->bitmap_info.nodes; struct dlm_lock_resource *bm_lockres; + struct md_bitmap_stats stats; + void *bitmap = mddev->bitmap; + unsigned long sync_size = 0; + unsigned long my_sync_size; + char str[64]; + int i, rv; - sb = kmap_atomic(bitmap->storage.sb_page); - my_sync_size = sb->sync_size; - kunmap_atomic(sb); + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); + if (rv) + return rv; + + my_sync_size = stats.sync_size; for (i = 0; i < node_num; i++) { if (i == current_slot) continue; - bitmap = get_bitmap_from_slot(mddev, i); + bitmap = mddev->bitmap_ops->get_from_slot(mddev, i); if (IS_ERR(bitmap)) { pr_err("can't get bitmap from slot %d\n", i); return -1; @@ -1238,25 +1250,28 @@ static int cluster_check_sync_size(struct mddev *mddev) bm_lockres = lockres_init(mddev, str, NULL, 1); if (!bm_lockres) { pr_err("md-cluster: Cannot initialize %s\n", str); - md_bitmap_free(bitmap); + mddev->bitmap_ops->free(bitmap); return -1; } bm_lockres->flags |= DLM_LKF_NOQUEUE; rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (!rv) - md_bitmap_update_sb(bitmap); + mddev->bitmap_ops->update_sb(bitmap); lockres_free(bm_lockres); - sb = kmap_atomic(bitmap->storage.sb_page); - if (sync_size == 0) - sync_size = sb->sync_size; - else if (sync_size != sb->sync_size) { - kunmap_atomic(sb); - md_bitmap_free(bitmap); + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); + if (rv) { + mddev->bitmap_ops->free(bitmap); + return rv; + } + + if (sync_size == 0) { + sync_size = stats.sync_size; + } else if (sync_size != stats.sync_size) { + mddev->bitmap_ops->free(bitmap); return -1; } - kunmap_atomic(sb); - md_bitmap_free(bitmap); + mddev->bitmap_ops->free(bitmap); } return (my_sync_size == sync_size) ? 0 : -1; @@ -1585,7 +1600,7 @@ static int gather_bitmaps(struct md_rdev *rdev) for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) { if (sn == (cinfo->slot_number - 1)) continue; - err = md_bitmap_copy_from_slot(mddev, sn, &lo, &hi, false); + err = mddev->bitmap_ops->copy_from_slot(mddev, sn, &lo, &hi, false); if (err) { pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); goto out; diff --git a/drivers/md/md.c b/drivers/md/md.c index d3a837506a36..179ee4afe937 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -546,137 +546,30 @@ static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_n return 0; } -/* - * Generic flush handling for md - */ - -static void md_end_flush(struct bio *bio) -{ - struct md_rdev *rdev = bio->bi_private; - struct mddev *mddev = rdev->mddev; - - bio_put(bio); - - rdev_dec_pending(rdev, mddev); - - if (atomic_dec_and_test(&mddev->flush_pending)) - /* The pre-request flush has finished */ - queue_work(md_wq, &mddev->flush_work); -} - -static void md_submit_flush_data(struct work_struct *ws); - -static void submit_flushes(struct work_struct *ws) +bool md_flush_request(struct mddev *mddev, struct bio *bio) { - struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct md_rdev *rdev; - - mddev->start_flush = ktime_get_boottime(); - INIT_WORK(&mddev->flush_work, md_submit_flush_data); - atomic_set(&mddev->flush_pending, 1); - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Faulty, &rdev->flags)) { - struct bio *bi; - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - bi = bio_alloc_bioset(rdev->bdev, 0, - REQ_OP_WRITE | REQ_PREFLUSH, - GFP_NOIO, &mddev->bio_set); - bi->bi_end_io = md_end_flush; - bi->bi_private = rdev; - atomic_inc(&mddev->flush_pending); - submit_bio(bi); - rcu_read_lock(); - } - rcu_read_unlock(); - if (atomic_dec_and_test(&mddev->flush_pending)) - queue_work(md_wq, &mddev->flush_work); -} - -static void md_submit_flush_data(struct work_struct *ws) -{ - struct mddev *mddev = container_of(ws, struct mddev, flush_work); - struct bio *bio = mddev->flush_bio; + struct bio *new; /* - * must reset flush_bio before calling into md_handle_request to avoid a - * deadlock, because other bios passed md_handle_request suspend check - * could wait for this and below md_handle_request could wait for those - * bios because of suspend check + * md_flush_reqeust() should be called under md_handle_request() and + * 'active_io' is already grabbed. Hence it's safe to get rdev directly + * without rcu protection. */ - spin_lock_irq(&mddev->lock); - mddev->prev_flush_start = mddev->start_flush; - mddev->flush_bio = NULL; - spin_unlock_irq(&mddev->lock); - wake_up(&mddev->sb_wait); + WARN_ON(percpu_ref_is_zero(&mddev->active_io)); - if (bio->bi_iter.bi_size == 0) { - /* an empty barrier - all done */ - bio_endio(bio); - } else { - bio->bi_opf &= ~REQ_PREFLUSH; - - /* - * make_requst() will never return error here, it only - * returns error in raid5_make_request() by dm-raid. - * Since dm always splits data and flush operation into - * two separate io, io size of flush submitted by dm - * always is 0, make_request() will not be called here. - */ - if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio))) - bio_io_error(bio); - } - - /* The pair is percpu_ref_get() from md_flush_request() */ - percpu_ref_put(&mddev->active_io); -} + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; -/* - * Manages consolidation of flushes and submitting any flushes needed for - * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is - * being finished in another context. Returns false if the flushing is - * complete but still needs the I/O portion of the bio to be processed. - */ -bool md_flush_request(struct mddev *mddev, struct bio *bio) -{ - ktime_t req_start = ktime_get_boottime(); - spin_lock_irq(&mddev->lock); - /* flush requests wait until ongoing flush completes, - * hence coalescing all the pending requests. - */ - wait_event_lock_irq(mddev->sb_wait, - !mddev->flush_bio || - ktime_before(req_start, mddev->prev_flush_start), - mddev->lock); - /* new request after previous flush is completed */ - if (ktime_after(req_start, mddev->prev_flush_start)) { - WARN_ON(mddev->flush_bio); - /* - * Grab a reference to make sure mddev_suspend() will wait for - * this flush to be done. - * - * md_flush_reqeust() is called under md_handle_request() and - * 'active_io' is already grabbed, hence percpu_ref_is_zero() - * won't pass, percpu_ref_tryget_live() can't be used because - * percpu_ref_kill() can be called by mddev_suspend() - * concurrently. - */ - WARN_ON(percpu_ref_is_zero(&mddev->active_io)); - percpu_ref_get(&mddev->active_io); - mddev->flush_bio = bio; - spin_unlock_irq(&mddev->lock); - INIT_WORK(&mddev->flush_work, submit_flushes); - queue_work(md_wq, &mddev->flush_work); - return true; + new = bio_alloc_bioset(rdev->bdev, 0, + REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, + &mddev->bio_set); + bio_chain(new, bio); + submit_bio(new); } - /* flush was performed for some other bio while we waited. */ - spin_unlock_irq(&mddev->lock); - if (bio->bi_iter.bi_size == 0) { - /* pure flush without data - all done */ + if (bio_sectors(bio) == 0) { bio_endio(bio); return true; } @@ -763,7 +656,6 @@ int mddev_init(struct mddev *mddev) atomic_set(&mddev->openers, 0); atomic_set(&mddev->sync_seq, 0); spin_lock_init(&mddev->lock); - atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; @@ -772,6 +664,7 @@ int mddev_init(struct mddev *mddev) mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; + mddev_set_bitmap_ops(mddev); INIT_WORK(&mddev->sync_work, md_start_sync); INIT_WORK(&mddev->del_work, mddev_delayed_delete); @@ -1372,6 +1265,18 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor return ret; } +static u64 md_bitmap_events_cleared(struct mddev *mddev) +{ + struct md_bitmap_stats stats; + int err; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (err) + return 0; + + return stats.events_cleared; +} + /* * validate_super for 0.90.0 * note: we are not using "freshest" for 0.9 superblock @@ -1464,7 +1369,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru /* if adding to array with a bitmap, then we can accept an * older device ... but not too old. */ - if (ev1 < mddev->bitmap->events_cleared) + if (ev1 < md_bitmap_events_cleared(mddev)) return 0; if (ev1 < mddev->events) set_bit(Bitmap_sync, &rdev->flags); @@ -1991,7 +1896,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc /* If adding to array with a bitmap, then we can accept an * older device, but not too old. */ - if (ev1 < mddev->bitmap->events_cleared) + if (ev1 < md_bitmap_events_cleared(mddev)) return 0; if (ev1 < mddev->events) set_bit(Bitmap_sync, &rdev->flags); @@ -2323,7 +2228,6 @@ super_1_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { /* All necessary checks on new >= old have been done */ - struct bitmap *bitmap; if (new_offset >= rdev->data_offset) return 1; @@ -2340,11 +2244,18 @@ super_1_allow_new_offset(struct md_rdev *rdev, */ if (rdev->sb_start + (32+4)*2 > new_offset) return 0; - bitmap = rdev->mddev->bitmap; - if (bitmap && !rdev->mddev->bitmap_info.file && - rdev->sb_start + rdev->mddev->bitmap_info.offset + - bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) - return 0; + + if (!rdev->mddev->bitmap_info.file) { + struct mddev *mddev = rdev->mddev; + struct md_bitmap_stats stats; + int err; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (!err && rdev->sb_start + mddev->bitmap_info.offset + + stats.file_pages * (PAGE_SIZE >> 9) > new_offset) + return 0; + } + if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) return 0; @@ -2820,7 +2731,7 @@ repeat: mddev_add_trace_msg(mddev, "md md_update_sb"); rewrite: - md_bitmap_update_sb(mddev->bitmap); + mddev->bitmap_ops->update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ @@ -4142,6 +4053,34 @@ static struct md_sysfs_entry md_level = __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); static ssize_t +new_level_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->new_level); +} + +static ssize_t +new_level_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int n; + int err; + + err = kstrtouint(buf, 10, &n); + if (err < 0) + return err; + err = mddev_lock(mddev); + if (err) + return err; + + mddev->new_level = n; + md_update_sb(mddev, 1); + + mddev_unlock(mddev); + return len; +} +static struct md_sysfs_entry md_new_level = +__ATTR(new_level, 0664, new_level_show, new_level_store); + +static ssize_t layout_show(struct mddev *mddev, char *page) { /* just a number, not meaningful for all levels */ @@ -4680,17 +4619,23 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ while (*buf) { chunk = end_chunk = simple_strtoul(buf, &end, 0); - if (buf == end) break; + if (buf == end) + break; + if (*end == '-') { /* range */ buf = end + 1; end_chunk = simple_strtoul(buf, &end, 0); - if (buf == end) break; + if (buf == end) + break; } - if (*end && !isspace(*end)) break; - md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); + + if (*end && !isspace(*end)) + break; + + mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); buf = skip_spaces(end); } - md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ + mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */ out: mddev_unlock(mddev); return len; @@ -5666,6 +5611,7 @@ __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, static struct attribute *md_default_attrs[] = { &md_level.attr, + &md_new_level.attr, &md_layout.attr, &md_raid_disks.attr, &md_uuid.attr, @@ -6206,16 +6152,10 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - struct bitmap *bitmap; - - bitmap = md_bitmap_create(mddev, -1); - if (IS_ERR(bitmap)) { - err = PTR_ERR(bitmap); + err = mddev->bitmap_ops->create(mddev, -1); + if (err) pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); - } else - mddev->bitmap = bitmap; - } if (err) goto bitmap_abort; @@ -6285,7 +6225,7 @@ bitmap_abort: pers->free(mddev, mddev->private); mddev->private = NULL; module_put(pers->owner); - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); abort: bioset_exit(&mddev->io_clone_set); exit_sync_set: @@ -6304,9 +6244,10 @@ int do_md_run(struct mddev *mddev) err = md_run(mddev); if (err) goto out; - err = md_bitmap_load(mddev); + + err = mddev->bitmap_ops->load(mddev); if (err) { - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); goto out; } @@ -6450,7 +6391,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - md_bitmap_flush(mddev); + + mddev->bitmap_ops->flush(mddev); if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || @@ -6477,7 +6419,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { - md_bitmap_wait_behind_writes(mddev); + mddev->bitmap_ops->wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -6492,7 +6434,8 @@ static void mddev_detach(struct mddev *mddev) static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; - md_bitmap_destroy(mddev); + + mddev->bitmap_ops->destroy(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; @@ -7270,22 +7213,19 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = 0; if (mddev->pers) { if (fd >= 0) { - struct bitmap *bitmap; + err = mddev->bitmap_ops->create(mddev, -1); + if (!err) + err = mddev->bitmap_ops->load(mddev); - bitmap = md_bitmap_create(mddev, -1); - if (!IS_ERR(bitmap)) { - mddev->bitmap = bitmap; - err = md_bitmap_load(mddev); - } else - err = PTR_ERR(bitmap); if (err) { - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); fd = -1; } } else if (fd < 0) { - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); } } + if (fd < 0) { struct file *f = mddev->bitmap_info.file; if (f) { @@ -7554,7 +7494,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) goto err; } if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { - struct bitmap *bitmap; /* add the bitmap */ if (mddev->bitmap) { rv = -EEXIST; @@ -7568,24 +7507,24 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - bitmap = md_bitmap_create(mddev, -1); - if (!IS_ERR(bitmap)) { - mddev->bitmap = bitmap; - rv = md_bitmap_load(mddev); - } else - rv = PTR_ERR(bitmap); + rv = mddev->bitmap_ops->create(mddev, -1); + if (!rv) + rv = mddev->bitmap_ops->load(mddev); + if (rv) - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); } else { - /* remove the bitmap */ - if (!mddev->bitmap) { - rv = -ENOENT; + struct md_bitmap_stats stats; + + rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (rv) goto err; - } - if (mddev->bitmap->storage.file) { + + if (stats.file) { rv = -EINVAL; goto err; } + if (mddev->bitmap_info.nodes) { /* hold PW on all the bitmap lock */ if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { @@ -7600,7 +7539,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) module_put(md_cluster_mod); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); mddev->bitmap_info.offset = 0; } } @@ -8370,6 +8309,33 @@ static void md_seq_stop(struct seq_file *seq, void *v) spin_unlock(&all_mddevs_lock); } +static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) +{ + struct md_bitmap_stats stats; + unsigned long used_pages; + unsigned long chunk_kb; + int err; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (err) + return; + + chunk_kb = mddev->bitmap_info.chunksize >> 10; + used_pages = stats.pages - stats.missing_pages; + + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", + used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10), + chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, + chunk_kb ? "KB" : "B"); + + if (stats.file) { + seq_puts(seq, ", file: "); + seq_file_path(seq, stats.file, " \t\n"); + } + + seq_putc(seq, '\n'); +} + static int md_seq_show(struct seq_file *seq, void *v) { struct mddev *mddev; @@ -8390,14 +8356,19 @@ static int md_seq_show(struct seq_file *seq, void *v) spin_unlock(&all_mddevs_lock); spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { - seq_printf(seq, "%s : %sactive", mdname(mddev), - mddev->pers ? "" : "in"); + seq_printf(seq, "%s : ", mdname(mddev)); if (mddev->pers) { + if (test_bit(MD_BROKEN, &mddev->flags)) + seq_printf(seq, "broken"); + else + seq_printf(seq, "active"); if (mddev->ro == MD_RDONLY) seq_printf(seq, " (read-only)"); if (mddev->ro == MD_AUTO_READ) seq_printf(seq, " (auto-read-only)"); seq_printf(seq, " %s", mddev->pers->name); + } else { + seq_printf(seq, "inactive"); } sectors = 0; @@ -8453,7 +8424,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } else seq_printf(seq, "\n "); - md_bitmap_status(seq, mddev->bitmap); + md_bitmap_status(seq, mddev); seq_printf(seq, "\n"); } @@ -8668,7 +8639,6 @@ void md_write_start(struct mddev *mddev, struct bio *bi) BUG_ON(mddev->ro == MD_RDONLY); if (mddev->ro == MD_AUTO_READ) { /* need to switch to read/write */ - flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -9506,7 +9476,7 @@ static void md_start_sync(struct work_struct *ws) * stored on all devices. So make sure all bitmap pages get written. */ if (spares) - md_bitmap_write_all(mddev->bitmap); + mddev->bitmap_ops->write_all(mddev); name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? "reshape" : "resync"; @@ -9594,7 +9564,7 @@ static void unregister_sync_thread(struct mddev *mddev) void md_check_recovery(struct mddev *mddev) { if (mddev->bitmap) - md_bitmap_daemon_work(mddev); + mddev->bitmap_ops->daemon_work(mddev); if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) { @@ -9965,7 +9935,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) if (ret) pr_info("md-cluster: resize failed\n"); else - md_bitmap_update_sb(mddev->bitmap); + mddev->bitmap_ops->update_sb(mddev->bitmap); } /* Check for change of roles in the active devices */ diff --git a/drivers/md/md.h b/drivers/md/md.h index a0d6827dced9..5d2e6bd58e4d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -535,7 +535,8 @@ struct mddev { struct percpu_ref writes_pending; int sync_checkers; /* # of threads checking writes_pending */ - struct bitmap *bitmap; /* the bitmap for the device */ + void *bitmap; /* the bitmap for the device */ + struct bitmap_operations *bitmap_ops; struct { struct file *file; /* the bitmap file */ loff_t offset; /* offset from superblock of @@ -571,16 +572,6 @@ struct mddev { */ struct bio_set io_clone_set; - /* Generic flush handling. - * The last to finish preflush schedules a worker to submit - * the rest of the request (without the REQ_PREFLUSH flag). - */ - struct bio *flush_bio; - atomic_t flush_pending; - ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed - * flush was started. - */ - struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ mempool_t *serial_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 2ea1710a3b70..4378d3250bd7 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, * If bitmap is not enabled, it's safe to submit the io directly, and * this can get optimal performance. */ - if (!md_bitmap_enabled(mddev->bitmap)) { + if (!mddev->bitmap_ops->enabled(mddev)) { raid1_submit_write(bio); return true; } @@ -166,12 +166,9 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, * while current io submission must wait for bitmap io to be done. In order to * avoid such deadlock, submit bitmap io asynchronously. */ -static inline void raid1_prepare_flush_writes(struct bitmap *bitmap) +static inline void raid1_prepare_flush_writes(struct mddev *mddev) { - if (current->bio_list) - md_bitmap_unplug_async(bitmap); - else - md_bitmap_unplug(bitmap); + mddev->bitmap_ops->unplug(mddev, current->bio_list == NULL); } /* diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 761989d67906..6c9d24203f39 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -411,18 +411,20 @@ static void raid1_end_read_request(struct bio *bio) static void close_write(struct r1bio *r1_bio) { + struct mddev *mddev = r1_bio->mddev; + /* it really is the end of this request */ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { bio_free_pages(r1_bio->behind_master_bio); bio_put(r1_bio->behind_master_bio); r1_bio->behind_master_bio = NULL; } + /* clear the bitmap if all writes complete successfully */ - md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - test_bit(R1BIO_BehindIO, &r1_bio->state)); - md_write_end(r1_bio->mddev); + mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + test_bit(R1BIO_BehindIO, &r1_bio->state)); + md_write_end(mddev); } static void r1_bio_write_done(struct r1bio *r1_bio) @@ -900,7 +902,7 @@ static void wake_up_barrier(struct r1conf *conf) static void flush_bio_list(struct r1conf *conf, struct bio *bio) { /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - raid1_prepare_flush_writes(conf->mddev->bitmap); + raid1_prepare_flush_writes(conf->mddev); wake_up_barrier(conf); while (bio) { /* submit pending writes */ @@ -1317,13 +1319,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, struct r1conf *conf = mddev->private; struct raid1_info *mirror; struct bio *read_bio; - struct bitmap *bitmap = mddev->bitmap; const enum req_op op = bio_op(bio); const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; int max_sectors; int rdisk; bool r1bio_existed = !!r1_bio; - char b[BDEVNAME_SIZE]; /* * If r1_bio is set, we are blocking the raid1d thread @@ -1332,16 +1332,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, */ gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO; - if (r1bio_existed) { - /* Need to get the block device name carefully */ - struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; - - if (rdev) - snprintf(b, sizeof(b), "%pg", rdev->bdev); - else - strcpy(b, "???"); - } - /* * Still need barrier for READ in case that whole * array is frozen. @@ -1363,15 +1353,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, * used and no empty request is available. */ rdisk = read_balance(conf, r1_bio, &max_sectors); - if (rdisk < 0) { /* couldn't find anywhere to read from */ - if (r1bio_existed) { - pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", + if (r1bio_existed) + pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n", mdname(mddev), - b, - (unsigned long long)r1_bio->sector); - } + conf->mirrors[r1_bio->read_disk].rdev->bdev, + r1_bio->sector); raid_end_bio_io(r1_bio); return; } @@ -1383,15 +1371,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, (unsigned long long)r1_bio->sector, mirror->rdev->bdev); - if (test_bit(WriteMostly, &mirror->rdev->flags) && - bitmap) { + if (test_bit(WriteMostly, &mirror->rdev->flags)) { /* * Reading from a write-mostly device must take care not to * over-take any writes that are 'behind' */ mddev_add_trace_msg(mddev, "raid1 wait behind writes"); - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); + mddev->bitmap_ops->wait_behind_writes(mddev); } if (max_sectors < bio_sectors(bio)) { @@ -1432,7 +1418,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct r1conf *conf = mddev->private; struct r1bio *r1_bio; int i, disks; - struct bitmap *bitmap = mddev->bitmap; unsigned long flags; struct md_rdev *blocked_rdev; int first_clone; @@ -1585,7 +1570,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * at a time and thus needs a new bio that can fit the whole payload * this bio in page sized chunks. */ - if (write_behind && bitmap) + if (write_behind && mddev->bitmap) max_sectors = min_t(int, max_sectors, BIO_MAX_VECS * (PAGE_SIZE >> 9)); if (max_sectors < bio_sectors(bio)) { @@ -1612,19 +1597,23 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, continue; if (first_clone) { + unsigned long max_write_behind = + mddev->bitmap_info.max_write_behind; + struct md_bitmap_stats stats; + int err; + /* do behind I/O ? * Not if there are too many, or cannot * allocate memory, or a reader on WriteMostly * is waiting for behind writes to flush */ - if (bitmap && write_behind && - (atomic_read(&bitmap->behind_writes) - < mddev->bitmap_info.max_write_behind) && - !waitqueue_active(&bitmap->behind_wait)) { + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (!err && write_behind && !stats.behind_wait && + stats.behind_writes < max_write_behind) alloc_behind_master_bio(r1_bio, bio); - } - md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors, - test_bit(R1BIO_BehindIO, &r1_bio->state)); + mddev->bitmap_ops->startwrite( + mddev, r1_bio->sector, r1_bio->sectors, + test_bit(R1BIO_BehindIO, &r1_bio->state)); first_clone = 0; } @@ -2042,7 +2031,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) /* make sure these bits don't get cleared. */ do { - md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1); + mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks); s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); @@ -2771,7 +2760,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, int wonly = -1; int write_targets = 0, read_targets = 0; sector_t sync_blocks; - int still_degraded = 0; + bool still_degraded = false; int good_sectors = RESYNC_SECTORS; int min_bad = 0; /* number of sectors that are bad in all devices */ int idx = sector_to_idx(sector_nr); @@ -2788,12 +2777,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * We can find the current addess in mddev->curr_resync */ if (mddev->curr_resync < max_sector) /* aborted */ - md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); + mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; - md_bitmap_close_sync(mddev->bitmap); + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); if (mddev_is_clustered(mddev)) { @@ -2813,7 +2802,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* before building a request, check if we can skip these blocks.. * This call the bitmap_start_sync doesn't actually record anything */ - if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block, and probably several more */ *skipped = 1; @@ -2831,9 +2820,9 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * sector_nr + two times RESYNC_SECTORS */ - md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, - mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); if (raise_barrier(conf, sector_nr)) return 0; @@ -2864,7 +2853,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { if (i < conf->raid_disks) - still_degraded = 1; + still_degraded = true; } else if (!test_bit(In_sync, &rdev->flags)) { bio->bi_opf = REQ_OP_WRITE; bio->bi_end_io = end_sync_write; @@ -2988,8 +2977,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; if (sync_blocks == 0) { - if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, - &sync_blocks, still_degraded) && + if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, + &sync_blocks, still_degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; @@ -3313,14 +3302,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) * worth it. */ sector_t newsize = raid1_size(mddev, sectors, 0); + int ret; + if (mddev->external_size && mddev->array_sectors > newsize) return -EINVAL; - if (mddev->bitmap) { - int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); - if (ret) - return ret; - } + + ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); + if (ret) + return ret; + md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2a9c4ee982e0..f3bf1116794a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -426,12 +426,13 @@ static void raid10_end_read_request(struct bio *bio) static void close_write(struct r10bio *r10_bio) { + struct mddev *mddev = r10_bio->mddev; + /* clear the bitmap if all writes complete successfully */ - md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, - r10_bio->sectors, - !test_bit(R10BIO_Degraded, &r10_bio->state), - 0); - md_write_end(r10_bio->mddev); + mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors, + !test_bit(R10BIO_Degraded, &r10_bio->state), + false); + md_write_end(mddev); } static void one_write_done(struct r10bio *r10_bio) @@ -884,7 +885,7 @@ static void flush_pending_writes(struct r10conf *conf) __set_current_state(TASK_RUNNING); blk_start_plug(&plug); - raid1_prepare_flush_writes(conf->mddev->bitmap); + raid1_prepare_flush_writes(conf->mddev); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ @@ -1100,7 +1101,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) /* we aren't scheduling, so we can do the write-out directly. */ bio = bio_list_get(&plug->pending); - raid1_prepare_flush_writes(mddev->bitmap); + raid1_prepare_flush_writes(mddev); wake_up_barrier(conf); while (bio) { /* submit pending writes */ @@ -1492,7 +1493,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, md_account_bio(mddev, &bio); r10_bio->master_bio = bio; atomic_set(&r10_bio->remaining, 1); - md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); + mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors, + false); for (i = 0; i < conf->copies; i++) { if (r10_bio->devs[i].bio) @@ -2465,7 +2467,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) s = PAGE_SIZE >> 9; rdev = conf->mirrors[dr].rdev; - addr = r10_bio->devs[0].addr + sect, + addr = r10_bio->devs[0].addr + sect; ok = sync_page_io(rdev, addr, s << 9, @@ -3192,13 +3194,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mddev->curr_resync < max_sector) { /* aborted */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); + mddev->bitmap_ops->end_sync(mddev, + mddev->curr_resync, + &sync_blocks); else for (i = 0; i < conf->geo.raid_disks; i++) { sector_t sect = raid10_find_virt(conf, mddev->curr_resync, i); - md_bitmap_end_sync(mddev->bitmap, sect, - &sync_blocks, 1); + + mddev->bitmap_ops->end_sync(mddev, sect, + &sync_blocks); } } else { /* completed sync */ @@ -3218,7 +3222,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } conf->fullsync = 0; } - md_bitmap_close_sync(mddev->bitmap); + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); *skipped = 1; return sectors_skipped; @@ -3287,10 +3291,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio = NULL; for (i = 0 ; i < conf->geo.raid_disks; i++) { - int still_degraded; + bool still_degraded; struct r10bio *rb2; sector_t sect; - int must_sync; + bool must_sync; int any_working; struct raid10_info *mirror = &conf->mirrors[i]; struct md_rdev *mrdev, *mreplace; @@ -3307,7 +3311,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (!mrdev && !mreplace) continue; - still_degraded = 0; + still_degraded = false; /* want to reconstruct this device */ rb2 = r10_bio; sect = raid10_find_virt(conf, sector_nr, i); @@ -3320,8 +3324,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * we only need to recover the block if it is set in * the bitmap */ - must_sync = md_bitmap_start_sync(mddev->bitmap, sect, - &sync_blocks, 1); + must_sync = mddev->bitmap_ops->start_sync(mddev, sect, + &sync_blocks, + true); if (sync_blocks < max_sync) max_sync = sync_blocks; if (!must_sync && @@ -3359,13 +3364,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, struct md_rdev *rdev = conf->mirrors[j].rdev; if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { - still_degraded = 1; + still_degraded = false; break; } } - must_sync = md_bitmap_start_sync(mddev->bitmap, sect, - &sync_blocks, still_degraded); + must_sync = mddev->bitmap_ops->start_sync(mddev, sect, + &sync_blocks, still_degraded); any_working = 0; for (j=0; j<conf->copies;j++) { @@ -3538,12 +3543,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * safety reason, which ensures curr_resync_completed is * updated in bitmap_cond_end_sync. */ - md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, - &sync_blocks, mddev->degraded) && + if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, + &sync_blocks, + mddev->degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block */ @@ -4190,6 +4196,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) */ struct r10conf *conf = mddev->private; sector_t oldsize, size; + int ret; if (mddev->reshape_position != MaxSector) return -EBUSY; @@ -4202,11 +4209,11 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) if (mddev->external_size && mddev->array_sectors > size) return -EINVAL; - if (mddev->bitmap) { - int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0); - if (ret) - return ret; - } + + ret = mddev->bitmap_ops->resize(mddev, size, 0, false); + if (ret) + return ret; + md_set_array_sectors(mddev, size); if (sectors > mddev->dev_sectors && mddev->recovery_cp > oldsize) { @@ -4472,7 +4479,7 @@ static int raid10_start_reshape(struct mddev *mddev) newsize = raid10_size(mddev, 0, conf->geo.raid_disks); if (!mddev_is_clustered(mddev)) { - ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); + ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); if (ret) goto abort; else @@ -4487,20 +4494,20 @@ static int raid10_start_reshape(struct mddev *mddev) /* * some node is already performing reshape, and no need to - * call md_bitmap_resize again since it should be called when + * call bitmap_ops->resize again since it should be called when * receiving BITMAP_RESIZE msg */ if ((sb && (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) goto out; - ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); + ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); if (ret) goto abort; ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); if (ret) { - md_bitmap_resize(mddev->bitmap, oldsize, 0, 0); + mddev->bitmap_ops->resize(mddev, oldsize, 0, false); goto abort; } } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 874874fe4fa1..b4f7b79fd187 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -313,10 +313,10 @@ void r5c_handle_cached_data_endio(struct r5conf *conf, if (sh->dev[i].written) { set_bit(R5_UPTODATE, &sh->dev[i].flags); r5c_return_dev_pending_writes(conf, &sh->dev[i]); - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), - !test_bit(STRIPE_DEGRADED, &sh->state), - 0); + conf->mddev->bitmap_ops->endwrite(conf->mddev, + sh->sector, RAID5_STRIPE_SECTORS(conf), + !test_bit(STRIPE_DEGRADED, &sh->state), + false); } } } @@ -2798,7 +2798,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, { struct r5l_log *log = READ_ONCE(conf->log); int i; - int do_wakeup = 0; sector_t tree_index; void __rcu **pslot; uintptr_t refcount; @@ -2815,7 +2814,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, for (i = sh->disks; i--; ) { clear_bit(R5_InJournal, &sh->dev[i].flags); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - do_wakeup = 1; + wake_up_bit(&sh->dev[i].flags, R5_Overlap); } /* @@ -2828,9 +2827,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, if (atomic_dec_and_test(&conf->pending_full_writes)) md_wakeup_thread(conf->mddev->thread); - if (do_wakeup) - wake_up(&conf->wait_for_overlap); - spin_lock_irq(&log->stripe_in_journal_lock); list_del_init(&sh->r5c); spin_unlock_irq(&log->stripe_in_journal_lock); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c14cf2410365..dc2ea636d173 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2337,7 +2337,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (test_and_clear_bit(R5_Overlap, &dev->flags)) - wake_up(&sh->raid_conf->wait_for_overlap); + wake_up_bit(&dev->flags, R5_Overlap); } } local_unlock(&conf->percpu->lock); @@ -3473,7 +3473,7 @@ static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi, * With PPL only writes to consecutive data chunks within a * stripe are allowed because for a single stripe_head we can * only have one PPL entry at a time, which describes one data - * range. Not really an overlap, but wait_for_overlap can be + * range. Not really an overlap, but R5_Overlap can be * used to handle this. */ sector_t sector; @@ -3563,8 +3563,8 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, */ set_bit(STRIPE_BITMAP_PENDING, &sh->state); spin_unlock_irq(&sh->stripe_lock); - md_bitmap_startwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), 0); + conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector, + RAID5_STRIPE_SECTORS(conf), false); spin_lock_irq(&sh->stripe_lock); clear_bit(STRIPE_BITMAP_PENDING, &sh->state); if (!sh->batch_head) { @@ -3652,7 +3652,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, log_stripe_write_finished(sh); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); + wake_up_bit(&sh->dev[i].flags, R5_Overlap); while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { @@ -3663,8 +3663,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi = nextbi; } if (bitmap_end) - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), 0, 0); + conf->mddev->bitmap_ops->endwrite(conf->mddev, + sh->sector, RAID5_STRIPE_SECTORS(conf), + false, false); bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; @@ -3696,7 +3697,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].toread = NULL; spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); + wake_up_bit(&sh->dev[i].flags, R5_Overlap); if (bi) s->to_read--; while (bi && bi->bi_iter.bi_sector < @@ -3709,8 +3710,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, } } if (bitmap_end) - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), 0, 0); + conf->mddev->bitmap_ops->endwrite(conf->mddev, + sh->sector, RAID5_STRIPE_SECTORS(conf), + false, false); /* If we were in the middle of a write the parity block might * still be locked - so just clear all R5_LOCKED flags */ @@ -3734,7 +3736,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, BUG_ON(sh->batch_head); clear_bit(STRIPE_SYNCING, &sh->state); if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) - wake_up(&conf->wait_for_overlap); + wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap); s->syncing = 0; s->replacing = 0; /* There is nothing more to do for sync/check/repair. @@ -4059,10 +4061,10 @@ returnbi: bio_endio(wbi); wbi = wbi2; } - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), - !test_bit(STRIPE_DEGRADED, &sh->state), - 0); + conf->mddev->bitmap_ops->endwrite(conf->mddev, + sh->sector, RAID5_STRIPE_SECTORS(conf), + !test_bit(STRIPE_DEGRADED, &sh->state), + false); if (head_sh->batch_head) { sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -4875,7 +4877,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, { struct stripe_head *sh, *next; int i; - int do_wakeup = 0; list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { @@ -4911,7 +4912,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, spin_unlock_irq(&sh->stripe_lock); for (i = 0; i < sh->disks; i++) { if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - do_wakeup = 1; + wake_up_bit(&sh->dev[i].flags, R5_Overlap); sh->dev[i].flags = head_sh->dev[i].flags & (~((1 << R5_WriteError) | (1 << R5_Overlap))); } @@ -4925,12 +4926,9 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, spin_unlock_irq(&head_sh->stripe_lock); for (i = 0; i < head_sh->disks; i++) if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) - do_wakeup = 1; + wake_up_bit(&head_sh->dev[i].flags, R5_Overlap); if (head_sh->state & handle_flags) set_bit(STRIPE_HANDLE, &head_sh->state); - - if (do_wakeup) - wake_up(&head_sh->raid_conf->wait_for_overlap); } static void handle_stripe(struct stripe_head *sh) @@ -5196,7 +5194,7 @@ static void handle_stripe(struct stripe_head *sh) md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); clear_bit(STRIPE_SYNCING, &sh->state); if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) - wake_up(&conf->wait_for_overlap); + wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap); } /* If the failed drives are just a ReadError, then we might need @@ -5259,7 +5257,7 @@ static void handle_stripe(struct stripe_head *sh) } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); } @@ -5753,12 +5751,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) int d; again: sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0); - prepare_to_wait(&conf->wait_for_overlap, &w, - TASK_UNINTERRUPTIBLE); set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); if (test_bit(STRIPE_SYNCING, &sh->state)) { raid5_release_stripe(sh); - schedule(); + wait_on_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap, + TASK_UNINTERRUPTIBLE); goto again; } clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); @@ -5770,12 +5767,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) set_bit(R5_Overlap, &sh->dev[d].flags); spin_unlock_irq(&sh->stripe_lock); raid5_release_stripe(sh); - schedule(); + wait_on_bit(&sh->dev[d].flags, R5_Overlap, + TASK_UNINTERRUPTIBLE); goto again; } } set_bit(STRIPE_DISCARD, &sh->state); - finish_wait(&conf->wait_for_overlap, &w); sh->overwrite_disks = 0; for (d = 0; d < conf->raid_disks; d++) { if (d == sh->pd_idx || d == sh->qd_idx) @@ -5788,13 +5785,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) } spin_unlock_irq(&sh->stripe_lock); if (conf->mddev->bitmap) { - for (d = 0; - d < conf->raid_disks - conf->max_degraded; + for (d = 0; d < conf->raid_disks - conf->max_degraded; d++) - md_bitmap_startwrite(mddev->bitmap, - sh->sector, - RAID5_STRIPE_SECTORS(conf), - 0); + mddev->bitmap_ops->startwrite(mddev, sh->sector, + RAID5_STRIPE_SECTORS(conf), false); sh->bm_seq = conf->seq_flush + 1; set_bit(STRIPE_BIT_DELAY, &sh->state); } @@ -5855,7 +5849,6 @@ static int add_all_stripe_bios(struct r5conf *conf, struct bio *bi, int forwrite, int previous) { int dd_idx; - int ret = 1; spin_lock_irq(&sh->stripe_lock); @@ -5871,14 +5864,19 @@ static int add_all_stripe_bios(struct r5conf *conf, if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { set_bit(R5_Overlap, &dev->flags); - ret = 0; - continue; + spin_unlock_irq(&sh->stripe_lock); + raid5_release_stripe(sh); + /* release batch_last before wait to avoid risk of deadlock */ + if (ctx->batch_last) { + raid5_release_stripe(ctx->batch_last); + ctx->batch_last = NULL; + } + md_wakeup_thread(conf->mddev->thread); + wait_on_bit(&dev->flags, R5_Overlap, TASK_UNINTERRUPTIBLE); + return 0; } } - if (!ret) - goto out; - for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { struct r5dev *dev = &sh->dev[dd_idx]; @@ -5894,9 +5892,8 @@ static int add_all_stripe_bios(struct r5conf *conf, RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do); } -out: spin_unlock_irq(&sh->stripe_lock); - return ret; + return 1; } enum reshape_loc { @@ -5992,17 +5989,17 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, goto out_release; } - if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { - /* - * Stripe is busy expanding or add failed due to - * overlap. Flush everything and wait a while. - */ + if (test_bit(STRIPE_EXPANDING, &sh->state)) { md_wakeup_thread(mddev->thread); ret = STRIPE_SCHEDULE_AND_RETRY; goto out_release; } + if (!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { + ret = STRIPE_RETRY; + goto out; + } + if (stripe_can_batch(sh)) { stripe_add_to_batch_list(conf, sh, ctx->batch_last); if (ctx->batch_last) @@ -6073,6 +6070,7 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf, static bool raid5_make_request(struct mddev *mddev, struct bio * bi) { DEFINE_WAIT_FUNC(wait, woken_wake_function); + bool on_wq; struct r5conf *conf = mddev->private; sector_t logical_sector; struct stripe_request_ctx ctx = {}; @@ -6146,11 +6144,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) * sequential IO pattern. We don't bother with the optimization when * reshaping as the performance benefit is not worth the complexity. */ - if (likely(conf->reshape_progress == MaxSector)) + if (likely(conf->reshape_progress == MaxSector)) { logical_sector = raid5_bio_lowest_chunk_sector(conf, bi); + on_wq = false; + } else { + add_wait_queue(&conf->wait_for_reshape, &wait); + on_wq = true; + } s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf); - add_wait_queue(&conf->wait_for_overlap, &wait); while (1) { res = make_stripe_request(mddev, conf, &ctx, logical_sector, bi); @@ -6161,6 +6163,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) continue; if (res == STRIPE_SCHEDULE_AND_RETRY) { + WARN_ON_ONCE(!on_wq); /* * Must release the reference to batch_last before * scheduling and waiting for work to be done, @@ -6185,7 +6188,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) logical_sector = ctx.first_sector + (s << RAID5_STRIPE_SHIFT(conf)); } - remove_wait_queue(&conf->wait_for_overlap, &wait); + if (unlikely(on_wq)) + remove_wait_queue(&conf->wait_for_reshape, &wait); if (ctx.batch_last) raid5_release_stripe(ctx.batch_last); @@ -6338,7 +6342,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk : (safepos < writepos && readpos > writepos)) || time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { /* Cannot proceed until we've updated the superblock... */ - wait_event(conf->wait_for_overlap, + wait_event(conf->wait_for_reshape, atomic_read(&conf->reshape_stripes)==0 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); if (atomic_read(&conf->reshape_stripes) != 0) @@ -6364,7 +6368,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); sysfs_notify_dirent_safe(mddev->sysfs_completed); } @@ -6447,7 +6451,7 @@ finish: (sector_nr - mddev->curr_resync_completed) * 2 >= mddev->resync_max - mddev->curr_resync_completed) { /* Cannot proceed until we've updated the superblock... */ - wait_event(conf->wait_for_overlap, + wait_event(conf->wait_for_reshape, atomic_read(&conf->reshape_stripes) == 0 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); if (atomic_read(&conf->reshape_stripes) != 0) @@ -6473,7 +6477,7 @@ finish: spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); sysfs_notify_dirent_safe(mddev->sysfs_completed); } ret: @@ -6486,7 +6490,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n struct r5conf *conf = mddev->private; struct stripe_head *sh; sector_t sync_blocks; - int still_degraded = 0; + bool still_degraded = false; int i; if (sector_nr >= max_sector) { @@ -6498,17 +6502,17 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (mddev->curr_resync < max_sector) /* aborted */ - md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); + mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; - md_bitmap_close_sync(mddev->bitmap); + mddev->bitmap_ops->close_sync(mddev); return 0; } /* Allow raid5_quiesce to complete */ - wait_event(conf->wait_for_overlap, conf->quiesce != 2); + wait_event(conf->wait_for_reshape, conf->quiesce != 2); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return reshape_request(mddev, sector_nr, skipped); @@ -6531,7 +6535,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !conf->fullsync && - !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, + true) && sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { /* we can skip this block, and probably more */ do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf)); @@ -6540,7 +6545,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n return sync_blocks * RAID5_STRIPE_SECTORS(conf); } - md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); sh = raid5_get_active_stripe(conf, NULL, sector_nr, R5_GAS_NOBLOCK); @@ -6559,10 +6564,11 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n struct md_rdev *rdev = conf->disks[i].rdev; if (rdev == NULL || test_bit(Faulty, &rdev->flags)) - still_degraded = 1; + still_degraded = true; } - md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); + mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, + still_degraded); set_bit(STRIPE_SYNC_REQUESTED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); @@ -6767,7 +6773,7 @@ static void raid5d(struct md_thread *thread) /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; spin_unlock_irq(&conf->device_lock); - md_bitmap_unplug(mddev->bitmap); + mddev->bitmap_ops->unplug(mddev, true); spin_lock_irq(&conf->device_lock); conf->seq_write = conf->seq_flush; activate_bit_delay(conf, conf->temp_inactive_list); @@ -7492,7 +7498,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) init_waitqueue_head(&conf->wait_for_quiescent); init_waitqueue_head(&conf->wait_for_stripe); - init_waitqueue_head(&conf->wait_for_overlap); + init_waitqueue_head(&conf->wait_for_reshape); INIT_LIST_HEAD(&conf->handle_list); INIT_LIST_HEAD(&conf->loprio_list); INIT_LIST_HEAD(&conf->hold_list); @@ -8312,6 +8318,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) */ sector_t newsize; struct r5conf *conf = mddev->private; + int ret; if (raid5_has_log(conf) || raid5_has_ppl(conf)) return -EINVAL; @@ -8320,11 +8327,11 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) if (mddev->external_size && mddev->array_sectors > newsize) return -EINVAL; - if (mddev->bitmap) { - int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0); - if (ret) - return ret; - } + + ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false); + if (ret) + return ret; + md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { @@ -8550,7 +8557,7 @@ static void end_reshape(struct r5conf *conf) !test_bit(In_sync, &rdev->flags)) rdev->recovery_offset = MaxSector; spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); mddev_update_io_opt(conf->mddev, conf->raid_disks - conf->max_degraded); @@ -8614,13 +8621,13 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce) conf->quiesce = 1; unlock_all_device_hash_locks_irq(conf); /* allow reshape to continue */ - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); } else { /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_quiescent); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); unlock_all_device_hash_locks_irq(conf); } log_quiesce(conf, quiesce); @@ -8939,7 +8946,7 @@ static void raid5_prepare_suspend(struct mddev *mddev) { struct r5conf *conf = mddev->private; - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); } static struct md_personality raid6_personality = diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 9b5a7dc3f2a0..896ecfc4afa6 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -668,7 +668,7 @@ struct r5conf { struct llist_head released_stripes; wait_queue_head_t wait_for_quiescent; wait_queue_head_t wait_for_stripe; - wait_queue_head_t wait_for_overlap; + wait_queue_head_t wait_for_reshape; unsigned long cache_state; struct shrinker *shrinker; int pool_size; /* number of disks in stripeheads in pool */ diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c index 6f7e7a8fa5ae..ed5167f942d8 100644 --- a/drivers/nvme/common/keyring.c +++ b/drivers/nvme/common/keyring.c @@ -20,6 +20,28 @@ key_serial_t nvme_keyring_id(void) } EXPORT_SYMBOL_GPL(nvme_keyring_id); +static bool nvme_tls_psk_revoked(struct key *psk) +{ + return test_bit(KEY_FLAG_REVOKED, &psk->flags) || + test_bit(KEY_FLAG_INVALIDATED, &psk->flags); +} + +struct key *nvme_tls_key_lookup(key_serial_t key_id) +{ + struct key *key = key_lookup(key_id); + + if (IS_ERR(key)) { + pr_err("key id %08x not found\n", key_id); + return key; + } + if (nvme_tls_psk_revoked(key)) { + pr_err("key id %08x revoked\n", key_id); + return ERR_PTR(-EKEYREVOKED); + } + return key; +} +EXPORT_SYMBOL_GPL(nvme_tls_key_lookup); + static void nvme_tls_psk_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); @@ -36,14 +58,12 @@ static bool nvme_tls_psk_match(const struct key *key, pr_debug("%s: no key description\n", __func__); return false; } - match_len = strlen(key->description); - pr_debug("%s: id %s len %zd\n", __func__, key->description, match_len); - if (!match_data->raw_data) { pr_debug("%s: no match data\n", __func__); return false; } match_id = match_data->raw_data; + match_len = strlen(match_id); pr_debug("%s: match '%s' '%s' len %zd\n", __func__, match_id, key->description, match_len); return !memcmp(key->description, match_id, match_len); @@ -71,7 +91,7 @@ static struct key_type nvme_tls_psk_key_type = { static struct key *nvme_tls_psk_lookup(struct key *keyring, const char *hostnqn, const char *subnqn, - int hmac, bool generated) + u8 hmac, u8 psk_ver, bool generated) { char *identity; size_t identity_len = (NVMF_NQN_SIZE) * 2 + 11; @@ -82,8 +102,8 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring, if (!identity) return ERR_PTR(-ENOMEM); - snprintf(identity, identity_len, "NVMe0%c%02d %s %s", - generated ? 'G' : 'R', hmac, hostnqn, subnqn); + snprintf(identity, identity_len, "NVMe%u%c%02u %s %s", + psk_ver, generated ? 'G' : 'R', hmac, hostnqn, subnqn); if (!keyring) keyring = nvme_keyring; @@ -107,21 +127,38 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring, /* * NVMe PSK priority list * - * 'Retained' PSKs (ie 'generated == false') - * should be preferred to 'generated' PSKs, - * and SHA-384 should be preferred to SHA-256. + * 'Retained' PSKs (ie 'generated == false') should be preferred to 'generated' + * PSKs, PSKs with hash (psk_ver 1) should be preferred to PSKs without hash + * (psk_ver 0), and SHA-384 should be preferred to SHA-256. */ static struct nvme_tls_psk_priority_list { bool generated; + u8 psk_ver; enum nvme_tcp_tls_cipher cipher; } nvme_tls_psk_prio[] = { { .generated = false, + .psk_ver = 1, + .cipher = NVME_TCP_TLS_CIPHER_SHA384, }, + { .generated = false, + .psk_ver = 1, + .cipher = NVME_TCP_TLS_CIPHER_SHA256, }, + { .generated = false, + .psk_ver = 0, .cipher = NVME_TCP_TLS_CIPHER_SHA384, }, { .generated = false, + .psk_ver = 0, + .cipher = NVME_TCP_TLS_CIPHER_SHA256, }, + { .generated = true, + .psk_ver = 1, + .cipher = NVME_TCP_TLS_CIPHER_SHA384, }, + { .generated = true, + .psk_ver = 1, .cipher = NVME_TCP_TLS_CIPHER_SHA256, }, { .generated = true, + .psk_ver = 0, .cipher = NVME_TCP_TLS_CIPHER_SHA384, }, { .generated = true, + .psk_ver = 0, .cipher = NVME_TCP_TLS_CIPHER_SHA256, }, }; @@ -137,10 +174,11 @@ key_serial_t nvme_tls_psk_default(struct key *keyring, for (prio = 0; prio < ARRAY_SIZE(nvme_tls_psk_prio); prio++) { bool generated = nvme_tls_psk_prio[prio].generated; + u8 ver = nvme_tls_psk_prio[prio].psk_ver; enum nvme_tcp_tls_cipher cipher = nvme_tls_psk_prio[prio].cipher; tls_key = nvme_tls_psk_lookup(keyring, hostnqn, subnqn, - cipher, generated); + cipher, ver, generated); if (!IS_ERR(tls_key)) { tls_key_id = tls_key->serial; key_put(tls_key); diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index a3caef75aa0a..486afe598184 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -41,6 +41,7 @@ config NVME_HWMON config NVME_FABRICS select NVME_CORE + select NVME_KEYRING if NVME_TCP_TLS tristate config NVME_RDMA @@ -94,7 +95,6 @@ config NVME_TCP config NVME_TCP_TLS bool "NVMe over Fabrics TCP TLS encryption support" depends on NVME_TCP - select NVME_KEYRING select NET_HANDSHAKE select KEYS help @@ -109,6 +109,7 @@ config NVME_HOST_AUTH bool "NVMe over Fabrics In-Band Authentication in host side" depends on NVME_CORE select NVME_AUTH + select NVME_KEYRING if NVME_TCP_TLS help This provides support for NVMe over Fabrics In-Band Authentication in host side. diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 983909a600ad..ca9959a8fb9e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4,6 +4,7 @@ * Copyright (c) 2011-2014, Intel Corporation. */ +#include <linux/async.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/blk-integrity.h> @@ -987,8 +988,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1); cmnd->rw.reftag = 0; - cmnd->rw.apptag = 0; - cmnd->rw.appmask = 0; + cmnd->rw.lbat = 0; + cmnd->rw.lbatm = 0; if (ns->head->ms) { /* @@ -4040,6 +4041,35 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid) } } +/** + * struct async_scan_info - keeps track of controller & NSIDs to scan + * @ctrl: Controller on which namespaces are being scanned + * @next_nsid: Index of next NSID to scan in ns_list + * @ns_list: Pointer to list of NSIDs to scan + * + * Note: There is a single async_scan_info structure shared by all instances + * of nvme_scan_ns_async() scanning a given controller, so the atomic + * operations on next_nsid are critical to ensure each instance scans a unique + * NSID. + */ +struct async_scan_info { + struct nvme_ctrl *ctrl; + atomic_t next_nsid; + __le32 *ns_list; +}; + +static void nvme_scan_ns_async(void *data, async_cookie_t cookie) +{ + struct async_scan_info *scan_info = data; + int idx; + u32 nsid; + + idx = (u32)atomic_fetch_inc(&scan_info->next_nsid); + nsid = le32_to_cpu(scan_info->ns_list[idx]); + + nvme_scan_ns(scan_info->ctrl, nsid); +} + static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid) { @@ -4066,11 +4096,15 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) __le32 *ns_list; u32 prev = 0; int ret = 0, i; + ASYNC_DOMAIN(domain); + struct async_scan_info scan_info; ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); if (!ns_list) return -ENOMEM; + scan_info.ctrl = ctrl; + scan_info.ns_list = ns_list; for (;;) { struct nvme_command cmd = { .identify.opcode = nvme_admin_identify, @@ -4086,19 +4120,23 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) goto free; } + atomic_set(&scan_info.next_nsid, 0); for (i = 0; i < nr_entries; i++) { u32 nsid = le32_to_cpu(ns_list[i]); if (!nsid) /* end of the list? */ goto out; - nvme_scan_ns(ctrl, nsid); + async_schedule_domain(nvme_scan_ns_async, &scan_info, + &domain); while (++prev < nsid) nvme_ns_remove_by_nsid(ctrl, prev); } + async_synchronize_full_domain(&domain); } out: nvme_remove_invalid_namespaces(ctrl, prev); free: + async_synchronize_full_domain(&domain); kfree(ns_list); return ret; } @@ -4568,7 +4606,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, set->flags = BLK_MQ_F_SHOULD_MERGE; if (ctrl->ops->flags & NVME_F_BLOCKING) set->flags |= BLK_MQ_F_BLOCKING; - set->cmd_size = cmd_size, + set->cmd_size = cmd_size; set->driver_data = ctrl; set->nr_hw_queues = ctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; @@ -4678,7 +4716,6 @@ static void nvme_free_ctrl(struct device *dev) if (!subsys || ctrl->instance != subsys->instance) ida_free(&nvme_instance_ida, ctrl->instance); - key_put(ctrl->tls_key); nvme_free_cels(ctrl); nvme_mpath_uninit(ctrl); cleanup_srcu_struct(&ctrl->srcu); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index f5f545fa0103..432efcbf9e2f 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -665,7 +665,7 @@ static struct key *nvmf_parse_key(int key_id) return ERR_PTR(-EINVAL); } - key = key_lookup(key_id); + key = nvme_tls_key_lookup(key_id); if (IS_ERR(key)) pr_err("key id %08x not found\n", key_id); else diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index f1d58e70933f..1d769c842fbf 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -4,6 +4,7 @@ * Copyright (c) 2017-2021 Christoph Hellwig. */ #include <linux/bio-integrity.h> +#include <linux/blk-integrity.h> #include <linux/ptrace.h> /* for force_successful_syscall_return */ #include <linux/nvme_ioctl.h> #include <linux/io_uring/cmd.h> @@ -119,9 +120,14 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, struct request_queue *q = req->q; struct nvme_ns *ns = q->queuedata; struct block_device *bdev = ns ? ns->disk->part0 : NULL; + bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk); + bool has_metadata = meta_buffer && meta_len; struct bio *bio = NULL; int ret; + if (has_metadata && !supports_metadata) + return -EINVAL; + if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { struct iov_iter iter; @@ -143,15 +149,15 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, goto out; bio = req->bio; - if (bdev) { + if (bdev) bio_set_dev(bio, bdev); - if (meta_buffer && meta_len) { - ret = bio_integrity_map_user(bio, meta_buffer, meta_len, - meta_seed); - if (ret) - goto out_unmap; - req->cmd_flags |= REQ_INTEGRITY; - } + + if (has_metadata) { + ret = bio_integrity_map_user(bio, meta_buffer, meta_len, + meta_seed); + if (ret) + goto out_unmap; + req->cmd_flags |= REQ_INTEGRITY; } return ret; @@ -260,8 +266,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.control = cpu_to_le16(io.control); c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); + c.rw.lbat = cpu_to_le16(io.apptag); + c.rw.lbatm = cpu_to_le16(io.appmask); return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata, meta_len, lower_32_bits(io.slba), NULL, 0, 0); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index da57947130cc..313a4f978a2c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -91,6 +91,11 @@ enum nvme_quirks { NVME_QUIRK_NO_DEEPEST_PS = (1 << 5), /* + * Problems seen with concurrent commands + */ + NVME_QUIRK_QDEPTH_ONE = (1 << 6), + + /* * Set MEDIUM priority on SQ creation */ NVME_QUIRK_MEDIUM_PRIO_SQ = (1 << 7), @@ -372,7 +377,7 @@ struct nvme_ctrl { struct nvme_dhchap_key *ctrl_key; u16 transaction; #endif - struct key *tls_key; + key_serial_t tls_pskid; /* Power saving configuration */ u64 ps_max_latency_us; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c0533f3f64cb..7990c3f22ecf 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2563,15 +2563,8 @@ static int nvme_pci_enable(struct nvme_dev *dev) else dev->io_sqes = NVME_NVM_IOSQES; - /* - * Temporary fix for the Apple controller found in the MacBook8,1 and - * some MacBook7,1 to avoid controller resets and data loss. - */ - if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) { + if (dev->ctrl.quirks & NVME_QUIRK_QDEPTH_ONE) { dev->q_depth = 2; - dev_warn(dev->ctrl.device, "detected Apple NVMe controller, " - "set queue depth=%u to work around controller resets\n", - dev->q_depth); } else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG && (pdev->device == 0xa821 || pdev->device == 0xa822) && NVME_CAP_MQES(dev->ctrl.cap) == 0) { @@ -3442,6 +3435,8 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_BOGUS_NID, }, { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */ + .driver_data = NVME_QUIRK_QDEPTH_ONE }, { PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_BOGUS_NID, }, @@ -3576,7 +3571,12 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02), .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001), - .driver_data = NVME_QUIRK_SINGLE_VECTOR }, + /* + * Fix for the Apple controller found in the MacBook8,1 and + * some MacBook7,1 to avoid controller resets and data loss. + */ + .driver_data = NVME_QUIRK_SINGLE_VECTOR | + NVME_QUIRK_QDEPTH_ONE }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005), .driver_data = NVME_QUIRK_SINGLE_VECTOR | diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2eb33842f971..15b5e06039a5 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1363,8 +1363,8 @@ static void nvme_rdma_set_sig_domain(struct blk_integrity *bi, if (control & NVME_RW_PRINFO_PRCHK_REF) domain->sig.dif.ref_remap = true; - domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag); - domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask); + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat); + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm); domain->sig.dif.app_escape = true; if (pi_type == NVME_NS_DPS_PI_TYPE3) domain->sig.dif.ref_escape = true; @@ -1876,6 +1876,8 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) */ priv.hrqsize = cpu_to_le16(queue->queue_size); priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize); + /* cntlid should only be set when creating an I/O queue */ + priv.cntlid = cpu_to_le16(ctrl->ctrl.cntlid); } ret = rdma_connect_locked(queue->cm_id, ¶m); diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index ba05faaac562..eb345551d6fe 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -664,19 +664,6 @@ static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR, nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store); #endif -#ifdef CONFIG_NVME_TCP_TLS -static ssize_t tls_key_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - if (!ctrl->tls_key) - return 0; - return sysfs_emit(buf, "%08x", key_serial(ctrl->tls_key)); -} -static DEVICE_ATTR_RO(tls_key); -#endif - static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -704,9 +691,6 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_dhchap_secret.attr, &dev_attr_dhchap_ctrl_secret.attr, #endif -#ifdef CONFIG_NVME_TCP_TLS - &dev_attr_tls_key.attr, -#endif &dev_attr_adm_passthru_err_log_enabled.attr, NULL }; @@ -737,11 +721,6 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts) return 0; #endif -#ifdef CONFIG_NVME_TCP_TLS - if (a == &dev_attr_tls_key.attr && - (!ctrl->opts || strcmp(ctrl->opts->transport, "tcp"))) - return 0; -#endif return a->mode; } @@ -752,8 +731,77 @@ const struct attribute_group nvme_dev_attrs_group = { }; EXPORT_SYMBOL_GPL(nvme_dev_attrs_group); +#ifdef CONFIG_NVME_TCP_TLS +static ssize_t tls_key_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (!ctrl->tls_pskid) + return 0; + return sysfs_emit(buf, "%08x\n", ctrl->tls_pskid); +} +static DEVICE_ATTR_RO(tls_key); + +static ssize_t tls_configured_key_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct key *key = ctrl->opts->tls_key; + + return sysfs_emit(buf, "%08x\n", key_serial(key)); +} +static DEVICE_ATTR_RO(tls_configured_key); + +static ssize_t tls_keyring_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct key *keyring = ctrl->opts->keyring; + + return sysfs_emit(buf, "%s\n", keyring->description); +} +static DEVICE_ATTR_RO(tls_keyring); + +static struct attribute *nvme_tls_attrs[] = { + &dev_attr_tls_key.attr, + &dev_attr_tls_configured_key.attr, + &dev_attr_tls_keyring.attr, +}; + +static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (!ctrl->opts || strcmp(ctrl->opts->transport, "tcp")) + return 0; + + if (a == &dev_attr_tls_key.attr && + !ctrl->opts->tls) + return 0; + if (a == &dev_attr_tls_configured_key.attr && + !ctrl->opts->tls_key) + return 0; + if (a == &dev_attr_tls_keyring.attr && + !ctrl->opts->keyring) + return 0; + + return a->mode; +} + +const struct attribute_group nvme_tls_attrs_group = { + .attrs = nvme_tls_attrs, + .is_visible = nvme_tls_attrs_are_visible, +}; +#endif + const struct attribute_group *nvme_dev_attr_groups[] = { &nvme_dev_attrs_group, +#ifdef CONFIG_NVME_TCP_TLS + &nvme_tls_attrs_group, +#endif NULL, }; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a2a47d3ab99f..89c44413c593 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -165,6 +165,7 @@ struct nvme_tcp_queue { bool hdr_digest; bool data_digest; + bool tls_enabled; struct ahash_request *rcv_hash; struct ahash_request *snd_hash; __le32 exp_ddgst; @@ -213,7 +214,21 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) return queue - queue->ctrl->queues; } -static inline bool nvme_tcp_tls(struct nvme_ctrl *ctrl) +/* + * Check if the queue is TLS encrypted + */ +static inline bool nvme_tcp_queue_tls(struct nvme_tcp_queue *queue) +{ + if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) + return 0; + + return queue->tls_enabled; +} + +/* + * Check if TLS is configured for the controller. + */ +static inline bool nvme_tcp_tls_configured(struct nvme_ctrl *ctrl) { if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) return 0; @@ -368,7 +383,7 @@ static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue) static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) { - return !nvme_tcp_tls(&queue->ctrl->ctrl) && + return !nvme_tcp_queue_tls(queue) && nvme_tcp_queue_has_pending(queue); } @@ -1051,7 +1066,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) else msg.msg_flags |= MSG_MORE; - if (!sendpage_ok(page)) + if (!sendpages_ok(page, len, offset)) msg.msg_flags &= ~MSG_SPLICE_PAGES; bvec_set_page(&bvec, page, len, offset); @@ -1427,7 +1442,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) memset(&msg, 0, sizeof(msg)); iov.iov_base = icresp; iov.iov_len = sizeof(*icresp); - if (nvme_tcp_tls(&queue->ctrl->ctrl)) { + if (nvme_tcp_queue_tls(queue)) { msg.msg_control = cbuf; msg.msg_controllen = sizeof(cbuf); } @@ -1439,7 +1454,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) goto free_icresp; } ret = -ENOTCONN; - if (nvme_tcp_tls(&queue->ctrl->ctrl)) { + if (nvme_tcp_queue_tls(queue)) { ctype = tls_get_record_type(queue->sock->sk, (struct cmsghdr *)cbuf); if (ctype != TLS_RECORD_TYPE_DATA) { @@ -1581,13 +1596,16 @@ static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) goto out_complete; } - tls_key = key_lookup(pskid); + tls_key = nvme_tls_key_lookup(pskid); if (IS_ERR(tls_key)) { dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n", qid, pskid); queue->tls_err = -ENOKEY; } else { - ctrl->ctrl.tls_key = tls_key; + queue->tls_enabled = true; + if (qid == 0) + ctrl->ctrl.tls_pskid = key_serial(tls_key); + key_put(tls_key); queue->tls_err = 0; } @@ -1768,7 +1786,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, } /* If PSKs are configured try to start TLS */ - if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && pskid) { + if (nvme_tcp_tls_configured(nctrl) && pskid) { ret = nvme_tcp_start_tls(nctrl, queue, pskid); if (ret) goto err_init_connect; @@ -1829,6 +1847,8 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) mutex_lock(&queue->queue_lock); if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) __nvme_tcp_stop_queue(queue); + /* Stopping the queue will disable TLS */ + queue->tls_enabled = false; mutex_unlock(&queue->queue_lock); } @@ -1925,16 +1945,17 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) int ret; key_serial_t pskid = 0; - if (nvme_tcp_tls(ctrl)) { + if (nvme_tcp_tls_configured(ctrl)) { if (ctrl->opts->tls_key) pskid = key_serial(ctrl->opts->tls_key); - else + else { pskid = nvme_tls_psk_default(ctrl->opts->keyring, ctrl->opts->host->nqn, ctrl->opts->subsysnqn); - if (!pskid) { - dev_err(ctrl->device, "no valid PSK found\n"); - return -ENOKEY; + if (!pskid) { + dev_err(ctrl->device, "no valid PSK found\n"); + return -ENOKEY; + } } } @@ -1957,13 +1978,14 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { int i, ret; - if (nvme_tcp_tls(ctrl) && !ctrl->tls_key) { + if (nvme_tcp_tls_configured(ctrl) && !ctrl->tls_pskid) { dev_err(ctrl->device, "no PSK negotiated\n"); return -ENOKEY; } + for (i = 1; i < ctrl->queue_count; i++) { ret = nvme_tcp_alloc_queue(ctrl, i, - key_serial(ctrl->tls_key)); + ctrl->tls_pskid); if (ret) goto out_free_queues; } @@ -2144,6 +2166,11 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, if (remove) nvme_unquiesce_admin_queue(ctrl); nvme_tcp_destroy_admin_queue(ctrl, remove); + if (ctrl->tls_pskid) { + dev_dbg(ctrl->device, "Wipe negotiated TLS_PSK %08x\n", + ctrl->tls_pskid); + ctrl->tls_pskid = 0; + } } static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 85006b2df8ae..954d4c074770 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1015,8 +1015,6 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) if (nvme_is_fabrics(cmd)) return nvmet_parse_fabrics_admin_cmd(req); - if (unlikely(!nvmet_check_auth_status(req))) - return NVME_SC_AUTH_REQUIRED | NVME_STATUS_DNR; if (nvmet_is_disc_subsys(nvmet_req_subsys(req))) return nvmet_parse_discovery_cmd(req); diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index 8bc3f431c77f..7897d02c681d 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -25,6 +25,18 @@ int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, unsigned char key_hash; char *dhchap_secret; + if (!strlen(secret)) { + if (set_ctrl) { + kfree(host->dhchap_ctrl_secret); + host->dhchap_ctrl_secret = NULL; + host->dhchap_ctrl_key_hash = 0; + } else { + kfree(host->dhchap_secret); + host->dhchap_secret = NULL; + host->dhchap_key_hash = 0; + } + return 0; + } if (sscanf(secret, "DHHC-1:%hhd:%*s", &key_hash) != 1) return -EINVAL; if (key_hash > 3) { diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 1eff8ca6a5f1..1b6264fa5803 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -578,8 +578,8 @@ static void nvmet_rdma_set_sig_domain(struct blk_integrity *bi, if (control & NVME_RW_PRINFO_PRCHK_REF) domain->sig.dif.ref_remap = true; - domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag); - domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask); + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat); + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm); domain->sig.dif.app_escape = true; if (pi_type == NVME_NS_DPS_PI_TYPE3) domain->sig.dif.ref_escape = true; |