summaryrefslogtreecommitdiff
path: root/drivers/md/md-cluster.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/md-cluster.c')
-rw-r--r--drivers/md/md-cluster.c171
1 files changed, 116 insertions, 55 deletions
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index fcfc4b9b2672..11e3bc9d2a4b 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -44,6 +44,8 @@ struct resync_info {
/* md_cluster_info flags */
#define MD_CLUSTER_WAITING_FOR_NEWDISK 1
+#define MD_CLUSTER_SUSPEND_READ_BALANCING 2
+#define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3
struct md_cluster_info {
@@ -51,7 +53,6 @@ struct md_cluster_info {
dlm_lockspace_t *lockspace;
int slot_number;
struct completion completion;
- struct dlm_lock_resource *sb_lock;
struct mutex sb_mutex;
struct dlm_lock_resource *bitmap_lockres;
struct list_head suspend_list;
@@ -74,6 +75,7 @@ enum msg_type {
NEWDISK,
REMOVE,
RE_ADD,
+ BITMAP_NEEDS_SYNC,
};
struct cluster_msg {
@@ -98,7 +100,6 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
{
int ret = 0;
- init_completion(&res->completion);
ret = dlm_lock(res->ls, mode, &res->lksb,
res->flags, res->name, strlen(res->name),
0, sync_ast, res, res->bast);
@@ -123,6 +124,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
if (!res)
return NULL;
+ init_completion(&res->completion);
res->ls = cinfo->lockspace;
res->mddev = mddev;
namelen = strlen(name);
@@ -164,11 +166,24 @@ out_err:
static void lockres_free(struct dlm_lock_resource *res)
{
+ int ret;
+
if (!res)
return;
- init_completion(&res->completion);
- dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
+ /* cancel a lock request or a conversion request that is blocked */
+ res->flags |= DLM_LKF_CANCEL;
+retry:
+ ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
+ if (unlikely(ret != 0)) {
+ pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret);
+
+ /* if a lock conversion is cancelled, then the lock is put
+ * back to grant queue, need to ensure it is unlocked */
+ if (ret == -DLM_ECANCEL)
+ goto retry;
+ }
+ res->flags &= ~DLM_LKF_CANCEL;
wait_for_completion(&res->completion);
kfree(res->name);
@@ -176,18 +191,6 @@ static void lockres_free(struct dlm_lock_resource *res)
kfree(res);
}
-static char *pretty_uuid(char *dest, char *src)
-{
- int i, len = 0;
-
- for (i = 0; i < 16; i++) {
- if (i == 4 || i == 6 || i == 8 || i == 10)
- len += sprintf(dest + len, "-");
- len += sprintf(dest + len, "%02x", (__u8)src[i]);
- }
- return dest;
-}
-
static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
sector_t lo, sector_t hi)
{
@@ -275,18 +278,16 @@ clear_bit:
static void recover_prep(void *arg)
{
+ struct mddev *mddev = arg;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}
-static void recover_slot(void *arg, struct dlm_slot *slot)
+static void __recover_slot(struct mddev *mddev, int slot)
{
- struct mddev *mddev = arg;
struct md_cluster_info *cinfo = mddev->cluster_info;
- pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
- mddev->bitmap_info.cluster_name,
- slot->nodeid, slot->slot,
- cinfo->slot_number);
- set_bit(slot->slot - 1, &cinfo->recovery_map);
+ set_bit(slot, &cinfo->recovery_map);
if (!cinfo->recovery_thread) {
cinfo->recovery_thread = md_register_thread(recover_bitmaps,
mddev, "recover");
@@ -298,6 +299,20 @@ static void recover_slot(void *arg, struct dlm_slot *slot)
md_wakeup_thread(cinfo->recovery_thread);
}
+static void recover_slot(void *arg, struct dlm_slot *slot)
+{
+ struct mddev *mddev = arg;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
+ mddev->bitmap_info.cluster_name,
+ slot->nodeid, slot->slot,
+ cinfo->slot_number);
+ /* deduct one since dlm slot starts from one while the num of
+ * cluster-md begins with 0 */
+ __recover_slot(mddev, slot->slot - 1);
+}
+
static void recover_done(void *arg, struct dlm_slot *slots,
int num_slots, int our_slot,
uint32_t generation)
@@ -306,9 +321,17 @@ static void recover_done(void *arg, struct dlm_slot *slots,
struct md_cluster_info *cinfo = mddev->cluster_info;
cinfo->slot_number = our_slot;
- complete(&cinfo->completion);
+ /* completion is only need to be complete when node join cluster,
+ * it doesn't need to run during another node's failure */
+ if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
+ complete(&cinfo->completion);
+ clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
+ }
+ clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}
+/* the ops is called when node join the cluster, and do lock recovery
+ * if node failure occurs */
static const struct dlm_lockspace_ops md_ls_ops = {
.recover_prep = recover_prep,
.recover_slot = recover_slot,
@@ -383,7 +406,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
int len;
len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
- pretty_uuid(disk_uuid + len, cmsg->uuid);
+ sprintf(disk_uuid + len, "%pU", cmsg->uuid);
snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
init_completion(&cinfo->newdisk_completion);
@@ -452,6 +475,11 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
__func__, __LINE__, msg->slot);
process_readd_disk(mddev, msg);
break;
+ case BITMAP_NEEDS_SYNC:
+ pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
+ __func__, __LINE__, msg->slot);
+ __recover_slot(mddev, msg->slot);
+ break;
default:
pr_warn("%s:%d Received unknown message from %d\n",
__func__, __LINE__, msg->slot);
@@ -467,6 +495,7 @@ static void recv_daemon(struct md_thread *thread)
struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
struct cluster_msg msg;
+ int ret;
/*get CR on Message*/
if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
@@ -479,13 +508,21 @@ static void recv_daemon(struct md_thread *thread)
process_recvd_msg(thread->mddev, &msg);
/*release CR on ack_lockres*/
- dlm_unlock_sync(ack_lockres);
- /*up-convert to EX on message_lockres*/
- dlm_lock_sync(message_lockres, DLM_LOCK_EX);
+ ret = dlm_unlock_sync(ack_lockres);
+ if (unlikely(ret != 0))
+ pr_info("unlock ack failed return %d\n", ret);
+ /*up-convert to PR on message_lockres*/
+ ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
+ if (unlikely(ret != 0))
+ pr_info("lock PR on msg failed return %d\n", ret);
/*get CR on ack_lockres again*/
- dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
+ ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
+ if (unlikely(ret != 0))
+ pr_info("lock CR on ack failed return %d\n", ret);
/*release CR on message_lockres*/
- dlm_unlock_sync(message_lockres);
+ ret = dlm_unlock_sync(message_lockres);
+ if (unlikely(ret != 0))
+ pr_info("unlock msg failed return %d\n", ret);
}
/* lock_comm()
@@ -514,7 +551,7 @@ static void unlock_comm(struct md_cluster_info *cinfo)
* The function:
* 1. Grabs the message lockresource in EX mode
* 2. Copies the message to the message LVB
- * 3. Downconverts message lockresource to CR
+ * 3. Downconverts message lockresource to CW
* 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
* and the other nodes read the message. The thread will wait here until all other
* nodes have released ack lock resource.
@@ -535,12 +572,12 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
sizeof(struct cluster_msg));
- /*down-convert EX to CR on Message*/
- error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR);
+ /*down-convert EX to CW on Message*/
+ error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
if (error) {
- pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n",
+ pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
error);
- goto failed_message;
+ goto failed_ack;
}
/*up-convert CR to EX on Ack*/
@@ -560,7 +597,13 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
}
failed_ack:
- dlm_unlock_sync(cinfo->message_lockres);
+ error = dlm_unlock_sync(cinfo->message_lockres);
+ if (unlikely(error != 0)) {
+ pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
+ error);
+ /* in case the message can't be released due to some reason */
+ goto failed_ack;
+ }
failed_message:
return error;
}
@@ -582,6 +625,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
struct dlm_lock_resource *bm_lockres;
struct suspend_info *s;
char str[64];
+ sector_t lo, hi;
for (i = 0; i < total_slots; i++) {
@@ -612,9 +656,24 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
lockres_free(bm_lockres);
continue;
}
- if (ret)
+ if (ret) {
+ lockres_free(bm_lockres);
goto out;
- /* TODO: Read the disk bitmap sb and check if it needs recovery */
+ }
+
+ /* Read the disk bitmap sb and check if it needs recovery */
+ ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
+ if (ret) {
+ pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
+ lockres_free(bm_lockres);
+ continue;
+ }
+ if ((hi > 0) && (lo < mddev->recovery_cp)) {
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ mddev->recovery_cp = lo;
+ md_check_recovery(mddev);
+ }
+
dlm_unlock_sync(bm_lockres);
lockres_free(bm_lockres);
}
@@ -628,20 +687,20 @@ static int join(struct mddev *mddev, int nodes)
int ret, ops_rv;
char str[64];
- if (!try_module_get(THIS_MODULE))
- return -ENOENT;
-
cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
if (!cinfo)
return -ENOMEM;
+ INIT_LIST_HEAD(&cinfo->suspend_list);
+ spin_lock_init(&cinfo->suspend_lock);
init_completion(&cinfo->completion);
+ set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
mutex_init(&cinfo->sb_mutex);
mddev->cluster_info = cinfo;
memset(str, 0, 64);
- pretty_uuid(str, mddev->uuid);
+ sprintf(str, "%pU", mddev->uuid);
ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
DLM_LSFL_FS, LVB_SIZE,
&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
@@ -654,12 +713,6 @@ static int join(struct mddev *mddev, int nodes)
ret = -ERANGE;
goto err;
}
- cinfo->sb_lock = lockres_init(mddev, "cmd-super",
- NULL, 0);
- if (!cinfo->sb_lock) {
- ret = -ENOMEM;
- goto err;
- }
/* Initiate the communication resources */
ret = -ENOMEM;
cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
@@ -700,9 +753,6 @@ static int join(struct mddev *mddev, int nodes)
goto err;
}
- INIT_LIST_HEAD(&cinfo->suspend_list);
- spin_lock_init(&cinfo->suspend_lock);
-
ret = gather_all_resync_info(mddev, nodes);
if (ret)
goto err;
@@ -714,12 +764,10 @@ err:
lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres);
lockres_free(cinfo->bitmap_lockres);
- lockres_free(cinfo->sb_lock);
if (cinfo->lockspace)
dlm_release_lockspace(cinfo->lockspace, 2);
mddev->cluster_info = NULL;
kfree(cinfo);
- module_put(THIS_MODULE);
return ret;
}
@@ -735,7 +783,6 @@ static int leave(struct mddev *mddev)
lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres);
- lockres_free(cinfo->sb_lock);
lockres_free(cinfo->bitmap_lockres);
dlm_release_lockspace(cinfo->lockspace, 2);
return 0;
@@ -812,16 +859,30 @@ static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
static void resync_finish(struct mddev *mddev)
{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg;
+ int slot = cinfo->slot_number - 1;
+
pr_info("%s:%d\n", __func__, __LINE__);
resync_send(mddev, RESYNCING, 0, 0);
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+ cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
+ cmsg.slot = cpu_to_le32(slot);
+ sendmsg(cinfo, &cmsg);
+ }
}
-static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi)
+static int area_resyncing(struct mddev *mddev, int direction,
+ sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
int ret = 0;
struct suspend_info *s;
+ if ((direction == READ) &&
+ test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
+ return 1;
+
spin_lock_irq(&cinfo->suspend_lock);
if (list_empty(&cinfo->suspend_list))
goto out;