summaryrefslogtreecommitdiff
path: root/drivers/hv
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/hv')
-rw-r--r--drivers/hv/channel.c103
-rw-r--r--drivers/hv/channel_mgmt.c180
-rw-r--r--drivers/hv/connection.c35
-rw-r--r--drivers/hv/hv.c34
-rw-r--r--drivers/hv/hv_balloon.c82
-rw-r--r--drivers/hv/hv_util.c13
-rw-r--r--drivers/hv/hyperv_vmbus.h14
-rw-r--r--drivers/hv/vmbus_drv.c115
8 files changed, 403 insertions, 173 deletions
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 2978f5ee8d2a..da53180f5eb4 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -71,7 +71,8 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
struct vmbus_channel_msginfo *open_info = NULL;
void *in, *out;
unsigned long flags;
- int ret, t, err = 0;
+ int ret, err = 0;
+ unsigned long t;
spin_lock_irqsave(&newchannel->lock, flags);
if (newchannel->state == CHANNEL_OPEN_STATE) {
@@ -89,9 +90,10 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
out = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
get_order(send_ringbuffer_size + recv_ringbuffer_size));
- if (!out)
- return -ENOMEM;
-
+ if (!out) {
+ err = -ENOMEM;
+ goto error0;
+ }
in = (void *)((unsigned long)out + send_ringbuffer_size);
@@ -135,7 +137,7 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
GFP_KERNEL);
if (!open_info) {
err = -ENOMEM;
- goto error0;
+ goto error_gpadl;
}
init_completion(&open_info->waitevent);
@@ -151,7 +153,7 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
if (userdatalen > MAX_USER_DEFINED_BYTES) {
err = -EINVAL;
- goto error0;
+ goto error_gpadl;
}
if (userdatalen)
@@ -195,10 +197,14 @@ error1:
list_del(&open_info->msglistentry);
spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+error_gpadl:
+ vmbus_teardown_gpadl(newchannel, newchannel->ringbuffer_gpadlhandle);
+
error0:
free_pages((unsigned long)out,
get_order(send_ringbuffer_size + recv_ringbuffer_size));
kfree(open_info);
+ newchannel->state = CHANNEL_OPEN_STATE;
return err;
}
EXPORT_SYMBOL_GPL(vmbus_open);
@@ -495,6 +501,15 @@ static int vmbus_close_internal(struct vmbus_channel *channel)
put_cpu();
}
+ /*
+ * If the channel has been rescinded; process device removal.
+ */
+ if (channel->rescind) {
+ hv_process_channel_removal(channel,
+ channel->offermsg.child_relid);
+ return 0;
+ }
+
/* Send a closing message */
msg = &channel->close_msg.msg;
@@ -569,23 +584,9 @@ void vmbus_close(struct vmbus_channel *channel)
}
EXPORT_SYMBOL_GPL(vmbus_close);
-/**
- * vmbus_sendpacket() - Send the specified buffer on the given channel
- * @channel: Pointer to vmbus_channel structure.
- * @buffer: Pointer to the buffer you want to receive the data into.
- * @bufferlen: Maximum size of what the the buffer will hold
- * @requestid: Identifier of the request
- * @type: Type of packet that is being send e.g. negotiate, time
- * packet etc.
- *
- * Sends data in @buffer directly to hyper-v via the vmbus
- * This will send the data unparsed to hyper-v.
- *
- * Mainly used by Hyper-V drivers.
- */
-int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer,
+int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer,
u32 bufferlen, u64 requestid,
- enum vmbus_packet_type type, u32 flags)
+ enum vmbus_packet_type type, u32 flags, bool kick_q)
{
struct vmpacket_descriptor desc;
u32 packetlen = sizeof(struct vmpacket_descriptor) + bufferlen;
@@ -613,21 +614,49 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer,
ret = hv_ringbuffer_write(&channel->outbound, bufferlist, 3, &signal);
- if (ret == 0 && signal)
+ if ((ret == 0) && kick_q && signal)
vmbus_setevent(channel);
return ret;
}
+EXPORT_SYMBOL(vmbus_sendpacket_ctl);
+
+/**
+ * vmbus_sendpacket() - Send the specified buffer on the given channel
+ * @channel: Pointer to vmbus_channel structure.
+ * @buffer: Pointer to the buffer you want to receive the data into.
+ * @bufferlen: Maximum size of what the the buffer will hold
+ * @requestid: Identifier of the request
+ * @type: Type of packet that is being send e.g. negotiate, time
+ * packet etc.
+ *
+ * Sends data in @buffer directly to hyper-v via the vmbus
+ * This will send the data unparsed to hyper-v.
+ *
+ * Mainly used by Hyper-V drivers.
+ */
+int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer,
+ u32 bufferlen, u64 requestid,
+ enum vmbus_packet_type type, u32 flags)
+{
+ return vmbus_sendpacket_ctl(channel, buffer, bufferlen, requestid,
+ type, flags, true);
+}
EXPORT_SYMBOL(vmbus_sendpacket);
/*
- * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer
- * packets using a GPADL Direct packet type.
+ * vmbus_sendpacket_pagebuffer_ctl - Send a range of single-page buffer
+ * packets using a GPADL Direct packet type. This interface allows you
+ * to control notifying the host. This will be useful for sending
+ * batched data. Also the sender can control the send flags
+ * explicitly.
*/
-int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
+int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel,
struct hv_page_buffer pagebuffers[],
u32 pagecount, void *buffer, u32 bufferlen,
- u64 requestid)
+ u64 requestid,
+ u32 flags,
+ bool kick_q)
{
int ret;
int i;
@@ -655,7 +684,7 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
/* Setup the descriptor */
desc.type = VM_PKT_DATA_USING_GPA_DIRECT;
- desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
+ desc.flags = flags;
desc.dataoffset8 = descsize >> 3; /* in 8-bytes grandularity */
desc.length8 = (u16)(packetlen_aligned >> 3);
desc.transactionid = requestid;
@@ -676,11 +705,27 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
ret = hv_ringbuffer_write(&channel->outbound, bufferlist, 3, &signal);
- if (ret == 0 && signal)
+ if ((ret == 0) && kick_q && signal)
vmbus_setevent(channel);
return ret;
}
+
+/*
+ * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer
+ * packets using a GPADL Direct packet type.
+ */
+int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
+ struct hv_page_buffer pagebuffers[],
+ u32 pagecount, void *buffer, u32 bufferlen,
+ u64 requestid)
+{
+ u32 flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
+ return vmbus_sendpacket_pagebuffer_ctl(channel, pagebuffers, pagecount,
+ buffer, bufferlen, requestid,
+ flags, true);
+
+}
EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer);
/*
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 3736f71bdec5..611789139f9b 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -134,24 +134,55 @@ fw_error:
EXPORT_SYMBOL_GPL(vmbus_prep_negotiate_resp);
+static void vmbus_process_device_unregister(struct work_struct *work)
+{
+ struct device *dev;
+ struct vmbus_channel *channel = container_of(work,
+ struct vmbus_channel,
+ work);
+
+ dev = get_device(&channel->device_obj->device);
+ if (dev) {
+ vmbus_device_unregister(channel->device_obj);
+ put_device(dev);
+ }
+}
+
+static void vmbus_sc_creation_cb(struct work_struct *work)
+{
+ struct vmbus_channel *newchannel = container_of(work,
+ struct vmbus_channel,
+ work);
+ struct vmbus_channel *primary_channel = newchannel->primary_channel;
+
+ /*
+ * On entry sc_creation_callback has been already verified to
+ * be non-NULL.
+ */
+ primary_channel->sc_creation_callback(newchannel);
+}
+
/*
* alloc_channel - Allocate and initialize a vmbus channel object
*/
static struct vmbus_channel *alloc_channel(void)
{
+ static atomic_t chan_num = ATOMIC_INIT(0);
struct vmbus_channel *channel;
channel = kzalloc(sizeof(*channel), GFP_ATOMIC);
if (!channel)
return NULL;
+ channel->id = atomic_inc_return(&chan_num);
spin_lock_init(&channel->inbound_lock);
spin_lock_init(&channel->lock);
INIT_LIST_HEAD(&channel->sc_list);
INIT_LIST_HEAD(&channel->percpu_list);
- channel->controlwq = create_workqueue("hv_vmbus_ctl");
+ channel->controlwq = alloc_workqueue("hv_vmbus_ctl/%d", WQ_MEM_RECLAIM,
+ 1, channel->id);
if (!channel->controlwq) {
kfree(channel);
return NULL;
@@ -204,33 +235,21 @@ static void percpu_channel_deq(void *arg)
list_del(&channel->percpu_list);
}
-/*
- * vmbus_process_rescind_offer -
- * Rescind the offer by initiating a device removal
- */
-static void vmbus_process_rescind_offer(struct work_struct *work)
+
+void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid)
{
- struct vmbus_channel *channel = container_of(work,
- struct vmbus_channel,
- work);
+ struct vmbus_channel_relid_released msg;
unsigned long flags;
struct vmbus_channel *primary_channel;
- struct vmbus_channel_relid_released msg;
- struct device *dev;
-
- if (channel->device_obj) {
- dev = get_device(&channel->device_obj->device);
- if (dev) {
- vmbus_device_unregister(channel->device_obj);
- put_device(dev);
- }
- }
memset(&msg, 0, sizeof(struct vmbus_channel_relid_released));
- msg.child_relid = channel->offermsg.child_relid;
+ msg.child_relid = relid;
msg.header.msgtype = CHANNELMSG_RELID_RELEASED;
vmbus_post_msg(&msg, sizeof(struct vmbus_channel_relid_released));
+ if (channel == NULL)
+ return;
+
if (channel->target_cpu != get_cpu()) {
put_cpu();
smp_call_function_single(channel->target_cpu,
@@ -259,7 +278,6 @@ void vmbus_free_channels(void)
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
vmbus_device_unregister(channel->device_obj);
- kfree(channel->device_obj);
free_channel(channel);
}
}
@@ -268,11 +286,8 @@ void vmbus_free_channels(void)
* vmbus_process_offer - Process the offer by creating a channel/device
* associated with this offer
*/
-static void vmbus_process_offer(struct work_struct *work)
+static void vmbus_process_offer(struct vmbus_channel *newchannel)
{
- struct vmbus_channel *newchannel = container_of(work,
- struct vmbus_channel,
- work);
struct vmbus_channel *channel;
bool fnew = true;
bool enq = false;
@@ -335,10 +350,21 @@ static void vmbus_process_offer(struct work_struct *work)
}
newchannel->state = CHANNEL_OPEN_STATE;
+ channel->num_sc++;
if (channel->sc_creation_callback != NULL)
- channel->sc_creation_callback(newchannel);
-
- goto done_init_rescind;
+ /*
+ * We need to invoke the sub-channel creation
+ * callback; invoke this in a seperate work
+ * context since we are currently running on
+ * the global work context in which we handle
+ * messages from the host.
+ */
+ INIT_WORK(&newchannel->work,
+ vmbus_sc_creation_cb);
+ queue_work(newchannel->controlwq,
+ &newchannel->work);
+
+ return;
}
goto err_free_chan;
@@ -361,7 +387,7 @@ static void vmbus_process_offer(struct work_struct *work)
&newchannel->offermsg.offer.if_instance,
newchannel);
if (!newchannel->device_obj)
- goto err_free_chan;
+ goto err_deq_chan;
/*
* Add the new device to the bus. This will kick off device-driver
@@ -373,21 +399,26 @@ static void vmbus_process_offer(struct work_struct *work)
pr_err("unable to add child device object (relid %d)\n",
newchannel->offermsg.child_relid);
- spin_lock_irqsave(&vmbus_connection.channel_lock, flags);
- list_del(&newchannel->listentry);
- spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
kfree(newchannel->device_obj);
- goto err_free_chan;
+ goto err_deq_chan;
}
-done_init_rescind:
- spin_lock_irqsave(&newchannel->lock, flags);
- /* The next possible work is rescind handling */
- INIT_WORK(&newchannel->work, vmbus_process_rescind_offer);
- /* Check if rescind offer was already received */
- if (newchannel->rescind)
- queue_work(newchannel->controlwq, &newchannel->work);
- spin_unlock_irqrestore(&newchannel->lock, flags);
+
return;
+
+err_deq_chan:
+ spin_lock_irqsave(&vmbus_connection.channel_lock, flags);
+ list_del(&newchannel->listentry);
+ spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
+
+ if (newchannel->target_cpu != get_cpu()) {
+ put_cpu();
+ smp_call_function_single(newchannel->target_cpu,
+ percpu_channel_deq, newchannel, true);
+ } else {
+ percpu_channel_deq(newchannel);
+ put_cpu();
+ }
+
err_free_chan:
free_channel(newchannel);
}
@@ -411,6 +442,8 @@ static const struct hv_vmbus_device_id hp_devs[] = {
{ HV_SCSI_GUID, },
/* Network */
{ HV_NIC_GUID, },
+ /* NetworkDirect Guest RDMA */
+ { HV_ND_GUID, },
};
@@ -511,8 +544,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
newchannel->monitor_grp = (u8)offer->monitorid / 32;
newchannel->monitor_bit = (u8)offer->monitorid % 32;
- INIT_WORK(&newchannel->work, vmbus_process_offer);
- queue_work(newchannel->controlwq, &newchannel->work);
+ vmbus_process_offer(newchannel);
}
/*
@@ -529,24 +561,28 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
rescind = (struct vmbus_channel_rescind_offer *)hdr;
channel = relid2channel(rescind->child_relid);
- if (channel == NULL)
- /* Just return here, no channel found */
+ if (channel == NULL) {
+ hv_process_channel_removal(NULL, rescind->child_relid);
return;
+ }
spin_lock_irqsave(&channel->lock, flags);
channel->rescind = true;
- /*
- * channel->work.func != vmbus_process_rescind_offer means we are still
- * processing offer request and the rescind offer processing should be
- * postponed. It will be done at the very end of vmbus_process_offer()
- * as rescind flag is being checked there.
- */
- if (channel->work.func == vmbus_process_rescind_offer)
- /* work is initialized for vmbus_process_rescind_offer() from
- * vmbus_process_offer() where the channel got created */
- queue_work(channel->controlwq, &channel->work);
-
spin_unlock_irqrestore(&channel->lock, flags);
+
+ if (channel->device_obj) {
+ /*
+ * We will have to unregister this device from the
+ * driver core. Do this in the per-channel work context.
+ * Note that we are currently executing on the global
+ * workq for handling messages from the host.
+ */
+ INIT_WORK(&channel->work, vmbus_process_device_unregister);
+ queue_work(channel->controlwq, &channel->work);
+ } else {
+ hv_process_channel_removal(channel,
+ channel->offermsg.child_relid);
+ }
}
/*
@@ -787,7 +823,8 @@ int vmbus_request_offers(void)
{
struct vmbus_channel_message_header *msg;
struct vmbus_channel_msginfo *msginfo;
- int ret, t;
+ int ret;
+ unsigned long t;
msginfo = kmalloc(sizeof(*msginfo) +
sizeof(struct vmbus_channel_message_header),
@@ -826,9 +863,8 @@ cleanup:
/*
* Retrieve the (sub) channel on which to send an outgoing request.
- * When a primary channel has multiple sub-channels, we choose a
- * channel whose VCPU binding is closest to the VCPU on which
- * this call is being made.
+ * When a primary channel has multiple sub-channels, we try to
+ * distribute the load equally amongst all available channels.
*/
struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary)
{
@@ -836,11 +872,19 @@ struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary)
int cur_cpu;
struct vmbus_channel *cur_channel;
struct vmbus_channel *outgoing_channel = primary;
- int cpu_distance, new_cpu_distance;
+ int next_channel;
+ int i = 1;
if (list_empty(&primary->sc_list))
return outgoing_channel;
+ next_channel = primary->next_oc++;
+
+ if (next_channel > (primary->num_sc)) {
+ primary->next_oc = 0;
+ return outgoing_channel;
+ }
+
cur_cpu = hv_context.vp_index[get_cpu()];
put_cpu();
list_for_each_safe(cur, tmp, &primary->sc_list) {
@@ -851,18 +895,10 @@ struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary)
if (cur_channel->target_vp == cur_cpu)
return cur_channel;
- cpu_distance = ((outgoing_channel->target_vp > cur_cpu) ?
- (outgoing_channel->target_vp - cur_cpu) :
- (cur_cpu - outgoing_channel->target_vp));
-
- new_cpu_distance = ((cur_channel->target_vp > cur_cpu) ?
- (cur_channel->target_vp - cur_cpu) :
- (cur_cpu - cur_channel->target_vp));
-
- if (cpu_distance < new_cpu_distance)
- continue;
+ if (i == next_channel)
+ return cur_channel;
- outgoing_channel = cur_channel;
+ i++;
}
return outgoing_channel;
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index a63a795300b9..583d7d42b46d 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -216,10 +216,21 @@ int vmbus_connect(void)
cleanup:
pr_err("Unable to connect to host\n");
+
vmbus_connection.conn_state = DISCONNECTED;
+ vmbus_disconnect();
+
+ kfree(msginfo);
- if (vmbus_connection.work_queue)
+ return ret;
+}
+
+void vmbus_disconnect(void)
+{
+ if (vmbus_connection.work_queue) {
+ drain_workqueue(vmbus_connection.work_queue);
destroy_workqueue(vmbus_connection.work_queue);
+ }
if (vmbus_connection.int_page) {
free_pages((unsigned long)vmbus_connection.int_page, 0);
@@ -230,10 +241,6 @@ cleanup:
free_pages((unsigned long)vmbus_connection.monitor_pages[1], 0);
vmbus_connection.monitor_pages[0] = NULL;
vmbus_connection.monitor_pages[1] = NULL;
-
- kfree(msginfo);
-
- return ret;
}
/*
@@ -311,10 +318,8 @@ static void process_chn_event(u32 relid)
*/
channel = pcpu_relid2channel(relid);
- if (!channel) {
- pr_err("channel not found for relid - %u\n", relid);
+ if (!channel)
return;
- }
/*
* A channel once created is persistent even when there
@@ -349,10 +354,7 @@ static void process_chn_event(u32 relid)
else
bytes_to_read = 0;
} while (read_state && (bytes_to_read != 0));
- } else {
- pr_err("no channel callback for relid - %u\n", relid);
}
-
}
/*
@@ -433,9 +435,16 @@ int vmbus_post_msg(void *buffer, size_t buflen)
ret = hv_post_message(conn_id, 1, buffer, buflen);
switch (ret) {
+ case HV_STATUS_INVALID_CONNECTION_ID:
+ /*
+ * We could get this if we send messages too
+ * frequently.
+ */
+ ret = -EAGAIN;
+ break;
+ case HV_STATUS_INSUFFICIENT_MEMORY:
case HV_STATUS_INSUFFICIENT_BUFFERS:
ret = -ENOMEM;
- case -ENOMEM:
break;
case HV_STATUS_SUCCESS:
return ret;
@@ -445,7 +454,7 @@ int vmbus_post_msg(void *buffer, size_t buflen)
}
retries++;
- msleep(100);
+ msleep(1000);
}
return ret;
}
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 50e51a51ff8b..d3943bceecc3 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -312,7 +312,11 @@ static void hv_init_clockevent_device(struct clock_event_device *dev, int cpu)
dev->features = CLOCK_EVT_FEAT_ONESHOT;
dev->cpumask = cpumask_of(cpu);
dev->rating = 1000;
- dev->owner = THIS_MODULE;
+ /*
+ * Avoid settint dev->owner = THIS_MODULE deliberately as doing so will
+ * result in clockevents_config_and_register() taking additional
+ * references to the hv_vmbus module making it impossible to unload.
+ */
dev->set_mode = hv_ce_setmode;
dev->set_next_event = hv_ce_set_next_event;
@@ -470,6 +474,20 @@ void hv_synic_init(void *arg)
}
/*
+ * hv_synic_clockevents_cleanup - Cleanup clockevent devices
+ */
+void hv_synic_clockevents_cleanup(void)
+{
+ int cpu;
+
+ if (!(ms_hyperv.features & HV_X64_MSR_SYNTIMER_AVAILABLE))
+ return;
+
+ for_each_online_cpu(cpu)
+ clockevents_unbind_device(hv_context.clk_evt[cpu], cpu);
+}
+
+/*
* hv_synic_cleanup - Cleanup routine for hv_synic_init().
*/
void hv_synic_cleanup(void *arg)
@@ -477,11 +495,17 @@ void hv_synic_cleanup(void *arg)
union hv_synic_sint shared_sint;
union hv_synic_simp simp;
union hv_synic_siefp siefp;
+ union hv_synic_scontrol sctrl;
int cpu = smp_processor_id();
if (!hv_context.synic_initialized)
return;
+ /* Turn off clockevent device */
+ if (ms_hyperv.features & HV_X64_MSR_SYNTIMER_AVAILABLE)
+ hv_ce_setmode(CLOCK_EVT_MODE_SHUTDOWN,
+ hv_context.clk_evt[cpu]);
+
rdmsrl(HV_X64_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
shared_sint.masked = 1;
@@ -502,6 +526,10 @@ void hv_synic_cleanup(void *arg)
wrmsrl(HV_X64_MSR_SIEFP, siefp.as_uint64);
- free_page((unsigned long)hv_context.synic_message_page[cpu]);
- free_page((unsigned long)hv_context.synic_event_page[cpu]);
+ /* Disable the global synic bit */
+ rdmsrl(HV_X64_MSR_SCONTROL, sctrl.as_uint64);
+ sctrl.enable = 0;
+ wrmsrl(HV_X64_MSR_SCONTROL, sctrl.as_uint64);
+
+ hv_synic_free_cpu(cpu);
}
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index ff169386b2c7..c5bb87244dd7 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -503,6 +503,8 @@ struct hv_dynmem_device {
* Number of pages we have currently ballooned out.
*/
unsigned int num_pages_ballooned;
+ unsigned int num_pages_onlined;
+ unsigned int num_pages_added;
/*
* State to manage the ballooning (up) operation.
@@ -534,7 +536,6 @@ struct hv_dynmem_device {
struct task_struct *thread;
struct mutex ha_region_mutex;
- struct completion waiter_event;
/*
* A list of hot-add regions.
@@ -554,46 +555,32 @@ static struct hv_dynmem_device dm_device;
static void post_status(struct hv_dynmem_device *dm);
#ifdef CONFIG_MEMORY_HOTPLUG
-static void acquire_region_mutex(bool trylock)
-{
- if (trylock) {
- reinit_completion(&dm_device.waiter_event);
- while (!mutex_trylock(&dm_device.ha_region_mutex))
- wait_for_completion(&dm_device.waiter_event);
- } else {
- mutex_lock(&dm_device.ha_region_mutex);
- }
-}
-
-static void release_region_mutex(bool trylock)
-{
- if (trylock) {
- mutex_unlock(&dm_device.ha_region_mutex);
- } else {
- mutex_unlock(&dm_device.ha_region_mutex);
- complete(&dm_device.waiter_event);
- }
-}
-
static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
void *v)
{
+ struct memory_notify *mem = (struct memory_notify *)v;
+
switch (val) {
case MEM_GOING_ONLINE:
- acquire_region_mutex(true);
+ mutex_lock(&dm_device.ha_region_mutex);
break;
case MEM_ONLINE:
+ dm_device.num_pages_onlined += mem->nr_pages;
case MEM_CANCEL_ONLINE:
- release_region_mutex(true);
+ mutex_unlock(&dm_device.ha_region_mutex);
if (dm_device.ha_waiting) {
dm_device.ha_waiting = false;
complete(&dm_device.ol_waitevent);
}
break;
- case MEM_GOING_OFFLINE:
case MEM_OFFLINE:
+ mutex_lock(&dm_device.ha_region_mutex);
+ dm_device.num_pages_onlined -= mem->nr_pages;
+ mutex_unlock(&dm_device.ha_region_mutex);
+ break;
+ case MEM_GOING_OFFLINE:
case MEM_CANCEL_OFFLINE:
break;
}
@@ -646,7 +633,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
init_completion(&dm_device.ol_waitevent);
dm_device.ha_waiting = true;
- release_region_mutex(false);
+ mutex_unlock(&dm_device.ha_region_mutex);
nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
ret = add_memory(nid, PFN_PHYS((start_pfn)),
(HA_CHUNK << PAGE_SHIFT));
@@ -675,7 +662,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
* have not been "onlined" within the allowed time.
*/
wait_for_completion_timeout(&dm_device.ol_waitevent, 5*HZ);
- acquire_region_mutex(false);
+ mutex_lock(&dm_device.ha_region_mutex);
post_status(&dm_device);
}
@@ -886,7 +873,7 @@ static void hot_add_req(struct work_struct *dummy)
resp.hdr.size = sizeof(struct dm_hot_add_response);
#ifdef CONFIG_MEMORY_HOTPLUG
- acquire_region_mutex(false);
+ mutex_lock(&dm_device.ha_region_mutex);
pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;
@@ -918,7 +905,9 @@ static void hot_add_req(struct work_struct *dummy)
if (do_hot_add)
resp.page_count = process_hot_add(pg_start, pfn_cnt,
rg_start, rg_sz);
- release_region_mutex(false);
+
+ dm->num_pages_added += resp.page_count;
+ mutex_unlock(&dm_device.ha_region_mutex);
#endif
/*
* The result field of the response structure has the
@@ -1031,17 +1020,21 @@ static void post_status(struct hv_dynmem_device *dm)
status.hdr.trans_id = atomic_inc_return(&trans_id);
/*
- * The host expects the guest to report free memory.
- * Further, the host expects the pressure information to
- * include the ballooned out pages.
- * For a given amount of memory that we are managing, we
- * need to compute a floor below which we should not balloon.
- * Compute this and add it to the pressure report.
+ * The host expects the guest to report free and committed memory.
+ * Furthermore, the host expects the pressure information to include
+ * the ballooned out pages. For a given amount of memory that we are
+ * managing we need to compute a floor below which we should not
+ * balloon. Compute this and add it to the pressure report.
+ * We also need to report all offline pages (num_pages_added -
+ * num_pages_onlined) as committed to the host, otherwise it can try
+ * asking us to balloon them out.
*/
status.num_avail = val.freeram;
status.num_committed = vm_memory_committed() +
- dm->num_pages_ballooned +
- compute_balloon_floor();
+ dm->num_pages_ballooned +
+ (dm->num_pages_added > dm->num_pages_onlined ?
+ dm->num_pages_added - dm->num_pages_onlined : 0) +
+ compute_balloon_floor();
/*
* If our transaction ID is no longer current, just don't
@@ -1145,6 +1138,8 @@ static void balloon_up(struct work_struct *dummy)
bool alloc_error;
bool done = false;
int i;
+ struct sysinfo val;
+ unsigned long floor;
/* The host balloons pages in 2M granularity. */
WARN_ON_ONCE(num_pages % PAGES_IN_2M != 0);
@@ -1155,6 +1150,15 @@ static void balloon_up(struct work_struct *dummy)
*/
alloc_unit = 512;
+ si_meminfo(&val);
+ floor = compute_balloon_floor();
+
+ /* Refuse to balloon below the floor, keep the 2M granularity. */
+ if (val.freeram - num_pages < floor) {
+ num_pages = val.freeram > floor ? (val.freeram - floor) : 0;
+ num_pages -= num_pages % PAGES_IN_2M;
+ }
+
while (!done) {
bl_resp = (struct dm_balloon_response *)send_buffer;
memset(send_buffer, 0, PAGE_SIZE);
@@ -1414,7 +1418,8 @@ static void balloon_onchannelcallback(void *context)
static int balloon_probe(struct hv_device *dev,
const struct hv_vmbus_device_id *dev_id)
{
- int ret, t;
+ int ret;
+ unsigned long t;
struct dm_version_request version_req;
struct dm_capabilities cap_msg;
@@ -1439,7 +1444,6 @@ static int balloon_probe(struct hv_device *dev,
dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
init_completion(&dm_device.host_event);
init_completion(&dm_device.config_event);
- init_completion(&dm_device.waiter_event);
INIT_LIST_HEAD(&dm_device.ha_region_list);
mutex_init(&dm_device.ha_region_mutex);
INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index 3b9c9ef0deb8..7994ec2e4151 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -340,12 +340,8 @@ static int util_probe(struct hv_device *dev,
set_channel_read_state(dev->channel, false);
- ret = vmbus_open(dev->channel, 4 * PAGE_SIZE, 4 * PAGE_SIZE, NULL, 0,
- srv->util_cb, dev->channel);
- if (ret)
- goto error;
-
hv_set_drvdata(dev, srv);
+
/*
* Based on the host; initialize the framework and
* service version numbers we will negotiate.
@@ -365,6 +361,11 @@ static int util_probe(struct hv_device *dev,
hb_srv_version = HB_VERSION;
}
+ ret = vmbus_open(dev->channel, 4 * PAGE_SIZE, 4 * PAGE_SIZE, NULL, 0,
+ srv->util_cb, dev->channel);
+ if (ret)
+ goto error;
+
return 0;
error:
@@ -379,9 +380,9 @@ static int util_remove(struct hv_device *dev)
{
struct hv_util_service *srv = hv_get_drvdata(dev);
- vmbus_close(dev->channel);
if (srv->util_deinit)
srv->util_deinit();
+ vmbus_close(dev->channel);
kfree(srv->recv_buffer);
return 0;
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 44b1c9424712..88af4ec559c4 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -49,6 +49,17 @@ enum hv_cpuid_function {
HVCPUID_IMPLEMENTATION_LIMITS = 0x40000005,
};
+#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE 0x400
+
+#define HV_X64_MSR_CRASH_P0 0x40000100
+#define HV_X64_MSR_CRASH_P1 0x40000101
+#define HV_X64_MSR_CRASH_P2 0x40000102
+#define HV_X64_MSR_CRASH_P3 0x40000103
+#define HV_X64_MSR_CRASH_P4 0x40000104
+#define HV_X64_MSR_CRASH_CTL 0x40000105
+
+#define HV_CRASH_CTL_CRASH_NOTIFY 0x8000000000000000
+
/* Define version of the synthetic interrupt controller. */
#define HV_SYNIC_VERSION (1)
@@ -572,6 +583,8 @@ extern void hv_synic_init(void *irqarg);
extern void hv_synic_cleanup(void *arg);
+extern void hv_synic_clockevents_cleanup(void);
+
/*
* Host version information.
*/
@@ -692,6 +705,7 @@ void vmbus_free_channels(void);
/* Connection interface */
int vmbus_connect(void);
+void vmbus_disconnect(void);
int vmbus_post_msg(void *buffer, size_t buflen);
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index f518b8d7a5b5..8313e25378cb 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -33,9 +33,12 @@
#include <linux/hyperv.h>
#include <linux/kernel_stat.h>
#include <linux/clockchips.h>
+#include <linux/cpu.h>
#include <asm/hyperv.h>
#include <asm/hypervisor.h>
#include <asm/mshyperv.h>
+#include <linux/notifier.h>
+#include <linux/ptrace.h>
#include "hyperv_vmbus.h"
static struct acpi_device *hv_acpi_dev;
@@ -44,6 +47,31 @@ static struct tasklet_struct msg_dpc;
static struct completion probe_event;
static int irq;
+
+static int hyperv_panic_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct pt_regs *regs;
+
+ regs = current_pt_regs();
+
+ wrmsrl(HV_X64_MSR_CRASH_P0, regs->ip);
+ wrmsrl(HV_X64_MSR_CRASH_P1, regs->ax);
+ wrmsrl(HV_X64_MSR_CRASH_P2, regs->bx);
+ wrmsrl(HV_X64_MSR_CRASH_P3, regs->cx);
+ wrmsrl(HV_X64_MSR_CRASH_P4, regs->dx);
+
+ /*
+ * Let Hyper-V know there is crash data available
+ */
+ wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block hyperv_panic_block = {
+ .notifier_call = hyperv_panic_event,
+};
+
struct resource hyperv_mmio = {
.name = "hyperv mmio",
.flags = IORESOURCE_MEM,
@@ -507,14 +535,26 @@ static int vmbus_probe(struct device *child_device)
*/
static int vmbus_remove(struct device *child_device)
{
- struct hv_driver *drv = drv_to_hv_drv(child_device->driver);
+ struct hv_driver *drv;
struct hv_device *dev = device_to_hv_device(child_device);
-
- if (drv->remove)
- drv->remove(dev);
- else
- pr_err("remove not set for driver %s\n",
- dev_name(child_device));
+ u32 relid = dev->channel->offermsg.child_relid;
+
+ if (child_device->driver) {
+ drv = drv_to_hv_drv(child_device->driver);
+ if (drv->remove)
+ drv->remove(dev);
+ else {
+ hv_process_channel_removal(dev->channel, relid);
+ pr_err("remove not set for driver %s\n",
+ dev_name(child_device));
+ }
+ } else {
+ /*
+ * We don't have a driver for this device; deal with the
+ * rescind message by removing the channel.
+ */
+ hv_process_channel_removal(dev->channel, relid);
+ }
return 0;
}
@@ -573,6 +613,10 @@ static void vmbus_onmessage_work(struct work_struct *work)
{
struct onmessage_work_context *ctx;
+ /* Do not process messages if we're in DISCONNECTED state */
+ if (vmbus_connection.conn_state == DISCONNECTED)
+ return;
+
ctx = container_of(work, struct onmessage_work_context,
work);
vmbus_onmessage(&ctx->msg);
@@ -704,6 +748,39 @@ static void vmbus_isr(void)
}
}
+#ifdef CONFIG_HOTPLUG_CPU
+static int hyperv_cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+static void hv_cpu_hotplug_quirk(bool vmbus_loaded)
+{
+ static void *previous_cpu_disable;
+
+ /*
+ * Offlining a CPU when running on newer hypervisors (WS2012R2, Win8,
+ * ...) is not supported at this moment as channel interrupts are
+ * distributed across all of them.
+ */
+
+ if ((vmbus_proto_version == VERSION_WS2008) ||
+ (vmbus_proto_version == VERSION_WIN7))
+ return;
+
+ if (vmbus_loaded) {
+ previous_cpu_disable = smp_ops.cpu_disable;
+ smp_ops.cpu_disable = hyperv_cpu_disable;
+ pr_notice("CPU offlining is not supported by hypervisor\n");
+ } else if (previous_cpu_disable)
+ smp_ops.cpu_disable = previous_cpu_disable;
+}
+#else
+static void hv_cpu_hotplug_quirk(bool vmbus_loaded)
+{
+}
+#endif
+
/*
* vmbus_bus_init -Main vmbus driver initialization routine.
*
@@ -744,6 +821,16 @@ static int vmbus_bus_init(int irq)
if (ret)
goto err_alloc;
+ hv_cpu_hotplug_quirk(true);
+
+ /*
+ * Only register if the crash MSRs are available
+ */
+ if (ms_hyperv.features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &hyperv_panic_block);
+ }
+
vmbus_request_offers();
return 0;
@@ -840,10 +927,8 @@ int vmbus_device_register(struct hv_device *child_device_obj)
{
int ret = 0;
- static atomic_t device_num = ATOMIC_INIT(0);
-
- dev_set_name(&child_device_obj->device, "vmbus_0_%d",
- atomic_inc_return(&device_num));
+ dev_set_name(&child_device_obj->device, "vmbus_%d",
+ child_device_obj->channel->id);
child_device_obj->device.bus = &hv_bus;
child_device_obj->device.parent = &hv_acpi_dev->dev;
@@ -992,11 +1077,19 @@ cleanup:
static void __exit vmbus_exit(void)
{
+ int cpu;
+
+ vmbus_connection.conn_state = DISCONNECTED;
+ hv_synic_clockevents_cleanup();
hv_remove_vmbus_irq();
vmbus_free_channels();
bus_unregister(&hv_bus);
hv_cleanup();
+ for_each_online_cpu(cpu)
+ smp_call_function_single(cpu, hv_synic_cleanup, NULL, 1);
acpi_bus_unregister_driver(&vmbus_acpi_driver);
+ hv_cpu_hotplug_quirk(false);
+ vmbus_disconnect();
}