From 32a7627cf3a35396a8e834faf34e38ae9f3b1309 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Jun 2005 17:17:14 -0700 Subject: [PATCH] md: optimised resync using Bitmap based intent logging With this patch, the intent to write to some block in the array can be logged to a bitmap file. Each bit represents some number of sectors and is set before any update happens, and only cleared when all writes relating to all sectors are complete. After an unclean shutdown, information in this bitmap can be used to optimise resync - only sectors which could be out-of-sync need to be updated. Also if a drive is removed and then added back into an array, the recovery can make use of the bitmap to optimise reconstruction. This is not implemented in this patch. Currently the bitmap is stored in a file which must (obviously) be stored on a separate device. The patch only provided infrastructure. It does not update any personalities to bitmap intent logging. Md arrays can still be used with no bitmap file. This patch has minimal impact on such arrays. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/bitmap.h | 280 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 280 insertions(+) create mode 100644 include/linux/raid/bitmap.h (limited to 'include/linux/raid/bitmap.h') diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h new file mode 100644 index 000000000000..f785cf26cbad --- /dev/null +++ b/include/linux/raid/bitmap.h @@ -0,0 +1,280 @@ +/* + * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR 3 +#define BITMAP_MINOR 38 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared aswell, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stipe is clean + * Anything that dirties the stipe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? */ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SIZE 512 +#define BITMAP_BLOCK_SHIFT 9 + +/* how many blocks per chunk? (this is variable) */ +#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) + +/* when hijacked, the counters and bits represent even larger "chunks" */ +/* there will be 1024 chunks represented by each counter in the page pointers */ +#define PAGEPTR_BLOCK_RATIO(bitmap) \ + (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) +#define PAGEPTR_BLOCK_SHIFT(bitmap) \ + (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) +#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) + +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ + +/* map chunks (bits) to file pages - offset by the size of the superblock */ +#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_ACTIVE = 0x001, /* the bitmap is in use */ + BITMAP_STALE = 0x002 /* the bitmap file is out of date or had -EIO */ +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __u32 magic; /* 0 BITMAP_MAGIC */ + __u32 version; /* 4 the bitmap major for now, could change... */ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __u64 events; /* 24 event counter for the bitmap (1)*/ + __u64 events_cleared;/*32 event counter when last bit cleared (2) */ + __u64 sync_size; /* 40 the size of the md device's sync range(3) */ + __u32 state; /* 48 bitmap state information */ + __u32 chunksize; /* 52 the bitmap chunk size in bytes */ + __u32 daemon_sleep; /* 56 seconds between disk flushes */ + + __u8 pad[256 - 60]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. + * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. + */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * count of dirty bits on the page + */ + unsigned int count:31; +}; + +/* keep track of bitmap file pages that have pending writes on them */ +struct page_list { + struct list_head list; + struct page *page; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + struct bitmap_page *bp; + unsigned long pages; /* total number of pages in the bitmap */ + unsigned long missing_pages; /* number of pages not yet allocated */ + + mddev_t *mddev; /* the md device that the bitmap is for */ + + int counter_bits; /* how many bits per block counter */ + + /* bitmap chunksize -- how much data does each bit represent? */ + unsigned long chunksize; + unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunks; /* total number of data chunks for the array */ + + /* We hold a count on the chunk currently being synced, and drop + * it when the last block is started. If the resync is aborted + * midway, we need to be able to drop that count, so we remember + * the counted chunk.. + */ + unsigned long syncchunk; + + __u64 events_cleared; + + /* bitmap spinlock */ + spinlock_t lock; + + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap file superblock */ + struct page **filemap; /* list of cache pages for the file */ + unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file */ + + unsigned long flags; + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + unsigned long daemon_sleep; /* how many seconds between updates? */ + + /* + * bitmap write daemon - this daemon performs writes to the bitmap file + * this thread is only needed because of a limitation in ext3 (jbd) + * that does not allow a task to have two journal transactions ongoing + * simultaneously (even if the transactions are for two different + * filesystems) -- in the case of bitmap, that would be the filesystem + * that the bitmap file resides on and the filesystem that is mounted + * on the md device -- see current->journal_info in jbd/transaction.c + */ + mdk_thread_t *writeback_daemon; + spinlock_t write_lock; + struct semaphore write_ready; + struct semaphore write_done; + unsigned long writes_pending; + wait_queue_head_t write_wait; + struct list_head write_pages; + struct list_head complete_pages; + mempool_t *write_pool; +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +int bitmap_create(mddev_t *mddev); +void bitmap_destroy(mddev_t *mddev); +int bitmap_active(struct bitmap *bitmap); + +char *file_path(struct file *file, char *buf, int count); +void bitmap_print_sb(struct bitmap *bitmap); +int bitmap_update_sb(struct bitmap *bitmap); + +int bitmap_setallbits(struct bitmap *bitmap); + +/* these are exported */ +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, + int success); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks); +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); +void bitmap_close_sync(struct bitmap *bitmap); + +int bitmap_unplug(struct bitmap *bitmap); +int bitmap_daemon_work(struct bitmap *bitmap); +#endif + +#endif -- cgit v1.2.3 From 77ad4bc706fe6c52ab953f31c287a6af712d080c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Jun 2005 17:17:21 -0700 Subject: [PATCH] md: enable the bitmap write-back daemon and wait for it. Currently we don't wait for updates to the bitmap to be flushed to disk properly. The infrastructure all there, but it isn't being used.... A separate kernel thread (bitmap_writeback_daemon) is needed to wait for each page as we cannot get callbacks when a page write completes. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/bitmap.c | 119 ++++++++++++++++++++------------------------ include/linux/raid/bitmap.h | 13 +---- 2 files changed, 55 insertions(+), 77 deletions(-) (limited to 'include/linux/raid/bitmap.h') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 9462fdd517c0..86b6b037fa44 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -261,30 +261,33 @@ char *file_path(struct file *file, char *buf, int count) /* * write out a page */ -static int write_page(struct page *page, int wait) +static int write_page(struct bitmap *bitmap, struct page *page, int wait) { int ret = -ENOMEM; lock_page(page); - if (page->mapping == NULL) - goto unlock_out; - else if (i_size_read(page->mapping->host) < page->index << PAGE_SHIFT) { - ret = -ENOENT; - goto unlock_out; - } - ret = page->mapping->a_ops->prepare_write(NULL, page, 0, PAGE_SIZE); if (!ret) ret = page->mapping->a_ops->commit_write(NULL, page, 0, PAGE_SIZE); if (ret) { -unlock_out: unlock_page(page); return ret; } set_page_dirty(page); /* force it to be written out */ + + if (!wait) { + /* add to list to be waited for by daemon */ + struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO); + item->page = page; + page_cache_get(page); + spin_lock(&bitmap->write_lock); + list_add(&item->list, &bitmap->complete_pages); + spin_unlock(&bitmap->write_lock); + md_wakeup_thread(bitmap->writeback_daemon); + } return write_one_page(page, wait); } @@ -343,14 +346,13 @@ int bitmap_update_sb(struct bitmap *bitmap) spin_unlock_irqrestore(&bitmap->lock, flags); return 0; } - page_cache_get(bitmap->sb_page); spin_unlock_irqrestore(&bitmap->lock, flags); sb = (bitmap_super_t *)kmap(bitmap->sb_page); sb->events = cpu_to_le64(bitmap->mddev->events); if (!bitmap->mddev->degraded) sb->events_cleared = cpu_to_le64(bitmap->mddev->events); kunmap(bitmap->sb_page); - return write_page(bitmap->sb_page, 0); + return write_page(bitmap, bitmap->sb_page, 0); } /* print out the bitmap file superblock */ @@ -556,10 +558,10 @@ static void bitmap_file_unmap(struct bitmap *bitmap) static void bitmap_stop_daemons(struct bitmap *bitmap); /* dequeue the next item in a page list -- don't call from irq context */ -static struct page_list *dequeue_page(struct bitmap *bitmap, - struct list_head *head) +static struct page_list *dequeue_page(struct bitmap *bitmap) { struct page_list *item = NULL; + struct list_head *head = &bitmap->complete_pages; spin_lock(&bitmap->write_lock); if (list_empty(head)) @@ -573,23 +575,15 @@ out: static void drain_write_queues(struct bitmap *bitmap) { - struct list_head *queues[] = { &bitmap->complete_pages, NULL }; - struct list_head *head; struct page_list *item; - int i; - for (i = 0; queues[i]; i++) { - head = queues[i]; - while ((item = dequeue_page(bitmap, head))) { - page_cache_release(item->page); - mempool_free(item, bitmap->write_pool); - } + while ((item = dequeue_page(bitmap))) { + /* don't bother to wait */ + page_cache_release(item->page); + mempool_free(item, bitmap->write_pool); } - spin_lock(&bitmap->write_lock); - bitmap->writes_pending = 0; /* make sure waiters continue */ wake_up(&bitmap->write_wait); - spin_unlock(&bitmap->write_lock); } static void bitmap_file_put(struct bitmap *bitmap) @@ -734,13 +728,13 @@ int bitmap_unplug(struct bitmap *bitmap) spin_unlock_irqrestore(&bitmap->lock, flags); if (attr & (BITMAP_PAGE_DIRTY | BITMAP_PAGE_NEEDWRITE)) - if (write_page(page, 0)) + if (write_page(bitmap, page, 0)) return 1; } if (wait) { /* if any writes were performed, we need to wait on them */ spin_lock_irq(&bitmap->write_lock); wait_event_lock_irq(bitmap->write_wait, - bitmap->writes_pending == 0, bitmap->write_lock, + list_empty(&bitmap->complete_pages), bitmap->write_lock, wake_up_process(bitmap->writeback_daemon->tsk)); spin_unlock_irq(&bitmap->write_lock); } @@ -841,7 +835,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, int in_sync) */ memset(page_address(page) + offset, 0xff, PAGE_SIZE - offset); - ret = write_page(page, 1); + ret = write_page(bitmap, page, 1); if (ret) { kunmap(page); /* release, page not in filemap yet */ @@ -934,7 +928,7 @@ int bitmap_daemon_work(struct bitmap *bitmap) } spin_unlock_irqrestore(&bitmap->lock, flags); if (attr & BITMAP_PAGE_NEEDWRITE) { - if (write_page(page, 0)) + if (write_page(bitmap, page, 0)) bitmap_file_kick(bitmap); page_cache_release(page); } @@ -950,7 +944,7 @@ int bitmap_daemon_work(struct bitmap *bitmap) if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) { clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); - err = write_page(lastpage, 0); + err = write_page(bitmap, lastpage, 0); } else { set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); @@ -998,7 +992,7 @@ int bitmap_daemon_work(struct bitmap *bitmap) if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) { clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); - err = write_page(lastpage, 0); + err = write_page(bitmap, lastpage, 0); } else { set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); @@ -1034,46 +1028,40 @@ static void bitmap_writeback_daemon(mddev_t *mddev) struct page_list *item; int err = 0; - while (1) { - PRINTK("%s: bitmap writeback daemon waiting...\n", bmname(bitmap)); - down_interruptible(&bitmap->write_done); - if (signal_pending(current)) { - printk(KERN_INFO - "%s: bitmap writeback daemon got signal, exiting...\n", - bmname(bitmap)); - break; - } + if (signal_pending(current)) { + printk(KERN_INFO + "%s: bitmap writeback daemon got signal, exiting...\n", + bmname(bitmap)); + err = -EINTR; + goto out; + } - PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap)); - /* wait on bitmap page writebacks */ - while ((item = dequeue_page(bitmap, &bitmap->complete_pages))) { - page = item->page; - mempool_free(item, bitmap->write_pool); - PRINTK("wait on page writeback: %p %lu\n", page, bitmap->writes_pending); - wait_on_page_writeback(page); - PRINTK("finished page writeback: %p %lu\n", page, bitmap->writes_pending); - spin_lock(&bitmap->write_lock); - if (!--bitmap->writes_pending) - wake_up(&bitmap->write_wait); - spin_unlock(&bitmap->write_lock); - err = PageError(page); - page_cache_release(page); - if (err) { - printk(KERN_WARNING "%s: bitmap file writeback " - "failed (page %lu): %d\n", - bmname(bitmap), page->index, err); - bitmap_file_kick(bitmap); - goto out; - } + PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap)); + /* wait on bitmap page writebacks */ + while ((item = dequeue_page(bitmap))) { + page = item->page; + mempool_free(item, bitmap->write_pool); + PRINTK("wait on page writeback: %p\n", page); + wait_on_page_writeback(page); + PRINTK("finished page writeback: %p\n", page); + + err = PageError(page); + page_cache_release(page); + if (err) { + printk(KERN_WARNING "%s: bitmap file writeback " + "failed (page %lu): %d\n", + bmname(bitmap), page->index, err); + bitmap_file_kick(bitmap); + goto out; } } -out: + out: + wake_up(&bitmap->write_wait); if (err) { printk(KERN_INFO "%s: bitmap writeback daemon exiting (%d)\n", - bmname(bitmap), err); + bmname(bitmap), err); daemon_exit(bitmap, &bitmap->writeback_daemon); } - return; } static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr, @@ -1375,7 +1363,7 @@ int bitmap_setallbits(struct bitmap *bitmap) spin_unlock_irqrestore(&bitmap->lock, flags); memset(kmap(page), 0xff, PAGE_SIZE); kunmap(page); - if (write_page(page, 0)) + if (write_page(bitmap, page, 0)) return 1; } @@ -1443,7 +1431,6 @@ int bitmap_create(mddev_t *mddev) mddev->bitmap = bitmap; spin_lock_init(&bitmap->write_lock); - init_MUTEX_LOCKED(&bitmap->write_done); INIT_LIST_HEAD(&bitmap->complete_pages); init_waitqueue_head(&bitmap->write_wait); bitmap->write_pool = mempool_create(WRITE_POOL_SIZE, write_pool_alloc, diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index f785cf26cbad..cfe60cfc8f3d 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h @@ -233,21 +233,12 @@ struct bitmap { unsigned long daemon_sleep; /* how many seconds between updates? */ /* - * bitmap write daemon - this daemon performs writes to the bitmap file - * this thread is only needed because of a limitation in ext3 (jbd) - * that does not allow a task to have two journal transactions ongoing - * simultaneously (even if the transactions are for two different - * filesystems) -- in the case of bitmap, that would be the filesystem - * that the bitmap file resides on and the filesystem that is mounted - * on the md device -- see current->journal_info in jbd/transaction.c + * bitmap_writeback_daemon waits for file-pages that have been written, + * as there is no way to get a call-back when a page write completes. */ mdk_thread_t *writeback_daemon; spinlock_t write_lock; - struct semaphore write_ready; - struct semaphore write_done; - unsigned long writes_pending; wait_queue_head_t write_wait; - struct list_head write_pages; struct list_head complete_pages; mempool_t *write_pool; }; -- cgit v1.2.3 From a654b9d8f851f4ca02649d5825cbe6c608adb10c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Jun 2005 17:17:27 -0700 Subject: [PATCH] md: allow md intent bitmap to be stored near the superblock. This provides an alternate to storing the bitmap in a separate file. The bitmap can be stored at a given offset from the superblock. Obviously the creator of the array must make sure this doesn't intersect with data.... After is good for version-0.90 superblocks. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/bitmap.c | 132 +++++++++++++++++++++++++++++++++++--------- drivers/md/md.c | 40 +++++++++++++- include/linux/raid/bitmap.h | 2 + include/linux/raid/md.h | 15 ++++- include/linux/raid/md_k.h | 4 ++ include/linux/raid/md_p.h | 7 ++- 6 files changed, 170 insertions(+), 30 deletions(-) (limited to 'include/linux/raid/bitmap.h') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 204564dc6a0d..030d6861051a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -116,7 +116,7 @@ static unsigned char *bitmap_alloc_page(struct bitmap *bitmap) if (!page) printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap)); else - printk("%s: bitmap_alloc_page: allocated page at %p\n", + PRINTK("%s: bitmap_alloc_page: allocated page at %p\n", bmname(bitmap), page); return page; } @@ -258,13 +258,61 @@ char *file_path(struct file *file, char *buf, int count) * basic page I/O operations */ +/* IO operations when bitmap is stored near all superblocks */ +static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long index) +{ + /* choose a good rdev and read the page from there */ + + mdk_rdev_t *rdev; + struct list_head *tmp; + struct page *page = alloc_page(GFP_KERNEL); + sector_t target; + + if (!page) + return ERR_PTR(-ENOMEM); + do { + ITERATE_RDEV(mddev, rdev, tmp) + if (rdev->in_sync && !rdev->faulty) + goto found; + return ERR_PTR(-EIO); + + found: + target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512); + + } while (!sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)); + + page->index = index; + return page; +} + +static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait) +{ + mdk_rdev_t *rdev; + struct list_head *tmp; + + ITERATE_RDEV(mddev, rdev, tmp) + if (rdev->in_sync && !rdev->faulty) + md_super_write(mddev, rdev, + (rdev->sb_offset<<1) + offset + + page->index * (PAGE_SIZE/512), + PAGE_SIZE, + page); + + if (wait) + wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); + return 0; +} + /* - * write out a page + * write out a page to a file */ static int write_page(struct bitmap *bitmap, struct page *page, int wait) { int ret = -ENOMEM; + if (bitmap->file == NULL) + return write_sb_page(bitmap->mddev, bitmap->offset, page, wait); + lock_page(page); ret = page->mapping->a_ops->prepare_write(NULL, page, 0, PAGE_SIZE); @@ -394,7 +442,12 @@ static int bitmap_read_sb(struct bitmap *bitmap) int err = -EINVAL; /* page 0 is the superblock, read it... */ - bitmap->sb_page = read_page(bitmap->file, 0, &bytes_read); + if (bitmap->file) + bitmap->sb_page = read_page(bitmap->file, 0, &bytes_read); + else { + bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 0); + bytes_read = PAGE_SIZE; + } if (IS_ERR(bitmap->sb_page)) { err = PTR_ERR(bitmap->sb_page); bitmap->sb_page = NULL; @@ -625,14 +678,16 @@ static void bitmap_file_kick(struct bitmap *bitmap) bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET); bitmap_update_sb(bitmap); - path = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (path) - ptr = file_path(bitmap->file, path, PAGE_SIZE); + if (bitmap->file) { + path = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (path) + ptr = file_path(bitmap->file, path, PAGE_SIZE); - printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n", - bmname(bitmap), ptr ? ptr : ""); + printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n", + bmname(bitmap), ptr ? ptr : ""); - kfree(path); + kfree(path); + } bitmap_file_put(bitmap); @@ -676,7 +731,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) void *kaddr; unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); - if (!bitmap->file || !bitmap->filemap) { + if (!bitmap->filemap) { return; } @@ -715,7 +770,7 @@ int bitmap_unplug(struct bitmap *bitmap) * flushed out to disk */ for (i = 0; i < bitmap->file_pages; i++) { spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->file || !bitmap->filemap) { + if (!bitmap->filemap) { spin_unlock_irqrestore(&bitmap->lock, flags); return 0; } @@ -732,11 +787,15 @@ int bitmap_unplug(struct bitmap *bitmap) return 1; } if (wait) { /* if any writes were performed, we need to wait on them */ - spin_lock_irq(&bitmap->write_lock); - wait_event_lock_irq(bitmap->write_wait, - list_empty(&bitmap->complete_pages), bitmap->write_lock, - wake_up_process(bitmap->writeback_daemon->tsk)); - spin_unlock_irq(&bitmap->write_lock); + if (bitmap->file) { + spin_lock_irq(&bitmap->write_lock); + wait_event_lock_irq(bitmap->write_wait, + list_empty(&bitmap->complete_pages), bitmap->write_lock, + wake_up_process(bitmap->writeback_daemon->tsk)); + spin_unlock_irq(&bitmap->write_lock); + } else + wait_event(bitmap->mddev->sb_wait, + atomic_read(&bitmap->mddev->pending_writes)==0); } return 0; } @@ -764,7 +823,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, int in_sync) chunks = bitmap->chunks; file = bitmap->file; - BUG_ON(!file); + BUG_ON(!file && !bitmap->offset); #if INJECT_FAULTS_3 outofdate = 1; @@ -779,7 +838,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, int in_sync) num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE; - if (i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) { + if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) { printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", bmname(bitmap), (unsigned long) i_size_read(file->f_mapping->host), @@ -816,14 +875,18 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, int in_sync) */ page = bitmap->sb_page; offset = sizeof(bitmap_super_t); - } else { + } else if (file) { page = read_page(file, index, &dummy); - if (IS_ERR(page)) { /* read error */ - ret = PTR_ERR(page); - goto out; - } + offset = 0; + } else { + page = read_sb_page(bitmap->mddev, bitmap->offset, index); offset = 0; } + if (IS_ERR(page)) { /* read error */ + ret = PTR_ERR(page); + goto out; + } + oldindex = index; oldpage = page; kmap(page); @@ -874,6 +937,19 @@ out: return ret; } +void bitmap_write_all(struct bitmap *bitmap) +{ + /* We don't actually write all bitmap blocks here, + * just flag them as needing to be written + */ + + unsigned long chunks = bitmap->chunks; + unsigned long bytes = (chunks+7)/8 + sizeof(bitmap_super_t); + unsigned long num_pages = (bytes + PAGE_SIZE-1) / PAGE_SIZE; + while (num_pages--) + bitmap->filemap_attr[num_pages] |= BITMAP_PAGE_NEEDWRITE; +} + static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) { @@ -913,7 +989,7 @@ int bitmap_daemon_work(struct bitmap *bitmap) for (j = 0; j < bitmap->chunks; j++) { bitmap_counter_t *bmc; spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->file || !bitmap->filemap) { + if (!bitmap->filemap) { /* error or shutdown */ spin_unlock_irqrestore(&bitmap->lock, flags); break; @@ -1072,6 +1148,7 @@ static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr, spin_lock_irqsave(&bitmap->lock, flags); *ptr = NULL; + if (!bitmap->file) /* no need for daemon if there's no backing file */ goto out_unlock; @@ -1416,9 +1493,11 @@ int bitmap_create(mddev_t *mddev) BUG_ON(sizeof(bitmap_super_t) != 256); - if (!file) /* bitmap disabled, nothing to do */ + if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */ return 0; + BUG_ON(file && mddev->bitmap_offset); + bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) return -ENOMEM; @@ -1438,7 +1517,8 @@ int bitmap_create(mddev_t *mddev) return -ENOMEM; bitmap->file = file; - get_file(file); + bitmap->offset = mddev->bitmap_offset; + if (file) get_file(file); /* read superblock from bitmap file (this sets bitmap->chunksize) */ err = bitmap_read_sb(bitmap); if (err) diff --git a/drivers/md/md.c b/drivers/md/md.c index 7075bebb7f37..fde8acfac320 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -337,7 +337,7 @@ static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) return 0; } -static int sync_page_io(struct block_device *bdev, sector_t sector, int size, +int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct page *page, int rw) { struct bio *bio = bio_alloc(GFP_NOIO, 1); @@ -609,6 +609,17 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) memcpy(mddev->uuid+12,&sb->set_uuid3, 4); mddev->max_disks = MD_SB_DISKS; + + if (sb->state & (1<bitmap_file == NULL) { + if (mddev->level != 1) { + /* FIXME use a better test */ + printk(KERN_WARNING "md: bitmaps only support for raid1\n"); + return -EINVAL; + } + mddev->bitmap_offset = (MD_SB_BYTES >> 9); + } + } else if (mddev->pers == NULL) { /* Insist on good event counter while assembling */ __u64 ev1 = md_event(sb); @@ -702,6 +713,9 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->layout = mddev->layout; sb->chunk_size = mddev->chunk_size; + if (mddev->bitmap && mddev->bitmap_file == NULL) + sb->state |= (1<disks[0].state = (1<uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; + + if ((le32_to_cpu(sb->feature_map) & 1) && + mddev->bitmap_file == NULL ) { + if (mddev->level != 1) { + printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); + return -EINVAL; + } + mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); + } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling */ __u64 ev1 = le64_to_cpu(sb->events); @@ -960,6 +983,11 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) else sb->resync_offset = cpu_to_le64(0); + if (mddev->bitmap && mddev->bitmap_file == NULL) { + sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); + sb->feature_map = cpu_to_le32(1); + } + max_dev = 0; ITERATE_RDEV(mddev,rdev2,tmp) if (rdev2->desc_nr+1 > max_dev) @@ -2406,7 +2434,8 @@ static int set_bitmap_file(mddev_t *mddev, int fd) mdname(mddev)); fput(mddev->bitmap_file); mddev->bitmap_file = NULL; - } + } else + mddev->bitmap_offset = 0; /* file overrides offset */ return err; } @@ -3774,6 +3803,13 @@ void md_check_recovery(mddev_t *mddev) set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); if (!spares) set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + if (spares && mddev->bitmap && ! mddev->bitmap->file) { + /* We are adding a device or devices to an array + * which has the bitmap stored on all devices. + * So make sure all bitmap pages get written + */ + bitmap_write_all(mddev->bitmap); + } mddev->sync_thread = md_register_thread(md_do_sync, mddev, "%s_resync"); diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index cfe60cfc8f3d..e24b74b11150 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h @@ -217,6 +217,7 @@ struct bitmap { /* bitmap spinlock */ spinlock_t lock; + long offset; /* offset from superblock if file is NULL */ struct file *file; /* backing disk file */ struct page *sb_page; /* cached copy of the bitmap file superblock */ struct page **filemap; /* list of cache pages for the file */ @@ -255,6 +256,7 @@ void bitmap_print_sb(struct bitmap *bitmap); int bitmap_update_sb(struct bitmap *bitmap); int bitmap_setallbits(struct bitmap *bitmap); +void bitmap_write_all(struct bitmap *bitmap); /* these are exported */ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors); diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 75f41d8faed2..ffa316ce4dc8 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -60,7 +60,14 @@ */ #define MD_MAJOR_VERSION 0 #define MD_MINOR_VERSION 90 -#define MD_PATCHLEVEL_VERSION 1 +/* + * MD_PATCHLEVEL_VERSION indicates kernel functionality. + * >=1 means different superblock formats are selectable using SET_ARRAY_INFO + * and major_version/minor_version accordingly + * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT + * in the super status byte + */ +#define MD_PATCHLEVEL_VERSION 2 extern int register_md_personality (int p_num, mdk_personality_t *p); extern int unregister_md_personality (int p_num); @@ -78,6 +85,12 @@ extern void md_unplug_mddev(mddev_t *mddev); extern void md_print_devices (void); +extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, + sector_t sector, int size, struct page *page); +extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw); + + #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } #endif diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 3e977025cf43..a3725b57fb7d 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -273,6 +273,10 @@ struct mddev_s struct bitmap *bitmap; /* the bitmap for the device */ struct file *bitmap_file; /* the bitmap file */ + long bitmap_offset; /* offset from superblock of + * start of bitmap. May be + * negative, but not '0' + */ struct list_head all_mddevs; }; diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 8ba95d67329f..8e592a25a8b5 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -96,6 +96,7 @@ typedef struct mdp_device_descriptor_s { #define MD_SB_CLEAN 0 #define MD_SB_ERRORS 1 +#define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ typedef struct mdp_superblock_s { /* * Constant generic information @@ -184,7 +185,7 @@ struct mdp_superblock_1 { /* constant array information - 128 bytes */ __u32 magic; /* MD_SB_MAGIC: 0xa92b4efc - little endian */ __u32 major_version; /* 1 */ - __u32 feature_map; /* 0 for now */ + __u32 feature_map; /* bit 0 set if 'bitmap_offset' is meaningful */ __u32 pad0; /* always set to 0 when writing */ __u8 set_uuid[16]; /* user-space generated. */ @@ -197,6 +198,10 @@ struct mdp_superblock_1 { __u32 chunksize; /* in 512byte sectors */ __u32 raid_disks; + __u32 bitmap_offset; /* sectors after start of superblock that bitmap starts + * NOTE: signed, so bitmap can be before superblock + * only meaningful of feature_map[0] is set. + */ __u8 pad1[128-96]; /* set to 0 when written */ /* constant this-device information - 64 bytes */ -- cgit v1.2.3