diff options
author | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-07-16 21:45:23 +0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-07-16 21:45:23 +0400 |
commit | 02b2318e07f98a7cdf7089a4457a8d62424aa824 (patch) | |
tree | b40353a9ee6b034e21192ceb5df445fbc5fbdd32 | |
parent | b91cba52e9b7b3f1c0037908a192d93a869ca9e5 (diff) | |
parent | d54bc2793ec3405c6b8f217568a82b87bd8a591b (diff) | |
download | linux-02b2318e07f98a7cdf7089a4457a8d62424aa824.tar.xz |
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/sparc-2.6
* 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/sparc-2.6: (26 commits)
[SPARC64]: Fix UP build.
[SPARC64]: dr-cpu unconfigure support.
[SERIAL]: Fix console write locking in sparc drivers.
[SPARC64]: Give more accurate errors in dr_cpu_configure().
[SPARC64]: Clear cpu_{core,sibling}_map[] in smp_fill_in_sib_core_maps()
[SPARC64]: Fix leak when DR added cpu does not bootup.
[SPARC64]: Add ->set_affinity IRQ handlers.
[SPARC64]: Process dr-cpu events in a kthread instead of workqueue.
[SPARC64]: More sensible udelay implementation.
[SPARC64]: SMP build fixes.
[SPARC64]: mdesc.c needs linux/mm.h
[SPARC64]: Fix build regressions added by dr-cpu changes.
[SPARC64]: Unconditionally register vio_bus_type.
[SPARC64]: Initial LDOM cpu hotplug support.
[SPARC64]: Fix setting of variables in LDOM guest.
[SPARC64]: Fix MD property lifetime bugs.
[SPARC64]: Abstract out mdesc accesses for better MD update handling.
[SPARC64]: Use more mearningful names for IRQ registry.
[SPARC64]: Initial domain-services driver.
[SPARC64]: Export powerd facilities for external entities.
...
45 files changed, 8609 insertions, 537 deletions
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index 6566d13db04f..b84b6af1241e 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -108,6 +108,15 @@ config SECCOMP source kernel/Kconfig.hz +config HOTPLUG_CPU + bool "Support for hot-pluggable CPUs" + depends on SMP + select HOTPLUG + ---help--- + Say Y here to experiment with turning CPUs off and on. CPUs + can be controlled through /sys/devices/system/cpu/cpu#. + Say N if you want to disable CPU hotplug. + source "init/Kconfig" config SYSVIPC_COMPAT @@ -305,6 +314,12 @@ config SUN_IO bool default y +config SUN_LDOMS + bool "Sun Logical Domains support" + help + Say Y here is you want to support virtual devices via + Logical Domains. + config PCI bool "PCI support" select ARCH_SUPPORTS_MSI diff --git a/arch/sparc64/kernel/Makefile b/arch/sparc64/kernel/Makefile index f964bf28d21a..b66876bf410c 100644 --- a/arch/sparc64/kernel/Makefile +++ b/arch/sparc64/kernel/Makefile @@ -18,7 +18,7 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_PCI) += ebus.o isa.o pci_common.o pci_iommu.o \ pci_psycho.o pci_sabre.o pci_schizo.o \ pci_sun4v.o pci_sun4v_asm.o pci_fire.o -obj-$(CONFIG_SMP) += smp.o trampoline.o +obj-$(CONFIG_SMP) += smp.o trampoline.o hvtramp.o obj-$(CONFIG_SPARC32_COMPAT) += sys32.o sys_sparc32.o signal32.o obj-$(CONFIG_BINFMT_ELF32) += binfmt_elf32.o obj-$(CONFIG_BINFMT_AOUT32) += binfmt_aout32.o @@ -26,6 +26,7 @@ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_US3_FREQ) += us3_cpufreq.o obj-$(CONFIG_US2E_FREQ) += us2e_cpufreq.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_SUN_LDOMS) += ldc.o vio.o viohs.o ds.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDIT)$(CONFIG_SPARC32_COMPAT) += compat_audit.o obj-y += $(obj-yy) diff --git a/arch/sparc64/kernel/ds.c b/arch/sparc64/kernel/ds.c new file mode 100644 index 000000000000..1c587107cef0 --- /dev/null +++ b/arch/sparc64/kernel/ds.c @@ -0,0 +1,1158 @@ +/* ds.c: Domain Services driver for Logical Domains + * + * Copyright (C) 2007 David S. Miller <davem@davemloft.net> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/mutex.h> +#include <linux/kthread.h> +#include <linux/cpu.h> + +#include <asm/ldc.h> +#include <asm/vio.h> +#include <asm/power.h> +#include <asm/mdesc.h> +#include <asm/head.h> +#include <asm/irq.h> + +#define DRV_MODULE_NAME "ds" +#define PFX DRV_MODULE_NAME ": " +#define DRV_MODULE_VERSION "1.0" +#define DRV_MODULE_RELDATE "Jul 11, 2007" + +static char version[] __devinitdata = + DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; +MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_DESCRIPTION("Sun LDOM domain services driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_MODULE_VERSION); + +struct ds_msg_tag { + __u32 type; +#define DS_INIT_REQ 0x00 +#define DS_INIT_ACK 0x01 +#define DS_INIT_NACK 0x02 +#define DS_REG_REQ 0x03 +#define DS_REG_ACK 0x04 +#define DS_REG_NACK 0x05 +#define DS_UNREG_REQ 0x06 +#define DS_UNREG_ACK 0x07 +#define DS_UNREG_NACK 0x08 +#define DS_DATA 0x09 +#define DS_NACK 0x0a + + __u32 len; +}; + +/* Result codes */ +#define DS_OK 0x00 +#define DS_REG_VER_NACK 0x01 +#define DS_REG_DUP 0x02 +#define DS_INV_HDL 0x03 +#define DS_TYPE_UNKNOWN 0x04 + +struct ds_version { + __u16 major; + __u16 minor; +}; + +struct ds_ver_req { + struct ds_msg_tag tag; + struct ds_version ver; +}; + +struct ds_ver_ack { + struct ds_msg_tag tag; + __u16 minor; +}; + +struct ds_ver_nack { + struct ds_msg_tag tag; + __u16 major; +}; + +struct ds_reg_req { + struct ds_msg_tag tag; + __u64 handle; + __u16 major; + __u16 minor; + char svc_id[0]; +}; + +struct ds_reg_ack { + struct ds_msg_tag tag; + __u64 handle; + __u16 minor; +}; + +struct ds_reg_nack { + struct ds_msg_tag tag; + __u64 handle; + __u16 major; +}; + +struct ds_unreg_req { + struct ds_msg_tag tag; + __u64 handle; +}; + +struct ds_unreg_ack { + struct ds_msg_tag tag; + __u64 handle; +}; + +struct ds_unreg_nack { + struct ds_msg_tag tag; + __u64 handle; +}; + +struct ds_data { + struct ds_msg_tag tag; + __u64 handle; +}; + +struct ds_data_nack { + struct ds_msg_tag tag; + __u64 handle; + __u64 result; +}; + +struct ds_cap_state { + __u64 handle; + + void (*data)(struct ldc_channel *lp, + struct ds_cap_state *cp, + void *buf, int len); + + const char *service_id; + + u8 state; +#define CAP_STATE_UNKNOWN 0x00 +#define CAP_STATE_REG_SENT 0x01 +#define CAP_STATE_REGISTERED 0x02 +}; + +static void md_update_data(struct ldc_channel *lp, struct ds_cap_state *cp, + void *buf, int len); +static void domain_shutdown_data(struct ldc_channel *lp, + struct ds_cap_state *cp, + void *buf, int len); +static void domain_panic_data(struct ldc_channel *lp, + struct ds_cap_state *cp, + void *buf, int len); +#ifdef CONFIG_HOTPLUG_CPU +static void dr_cpu_data(struct ldc_channel *lp, + struct ds_cap_state *cp, + void *buf, int len); +#endif +static void ds_pri_data(struct ldc_channel *lp, + struct ds_cap_state *cp, + void *buf, int len); +static void ds_var_data(struct ldc_channel *lp, + struct ds_cap_state *cp, + void *buf, int len); + +struct ds_cap_state ds_states[] = { + { + .service_id = "md-update", + .data = md_update_data, + }, + { + .service_id = "domain-shutdown", + .data = domain_shutdown_data, + }, + { + .service_id = "domain-panic", + .data = domain_panic_data, + }, +#ifdef CONFIG_HOTPLUG_CPU + { + .service_id = "dr-cpu", + .data = dr_cpu_data, + }, +#endif + { + .service_id = "pri", + .data = ds_pri_data, + }, + { + .service_id = "var-config", + .data = ds_var_data, + }, + { + .service_id = "var-config-backup", + .data = ds_var_data, + }, +}; + +static DEFINE_SPINLOCK(ds_lock); + +struct ds_info { + struct ldc_channel *lp; + u8 hs_state; +#define DS_HS_START 0x01 +#define DS_HS_DONE 0x02 + + void *rcv_buf; + int rcv_buf_len; +}; + +static struct ds_info *ds_info; + +static struct ds_cap_state *find_cap(u64 handle) +{ + unsigned int index = handle >> 32; + + if (index >= ARRAY_SIZE(ds_states)) + return NULL; + return &ds_states[index]; +} + +static struct ds_cap_state *find_cap_by_string(const char *name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ds_states); i++) { + if (strcmp(ds_states[i].service_id, name)) + continue; + + return &ds_states[i]; + } + return NULL; +} + +static int ds_send(struct ldc_channel *lp, void *data, int len) +{ + int err, limit = 1000; + + err = -EINVAL; + while (limit-- > 0) { + err = ldc_write(lp, data, len); + if (!err || (err != -EAGAIN)) + break; + udelay(1); + } + + return err; +} + +struct ds_md_update_req { + __u64 req_num; +}; + +struct ds_md_update_res { + __u64 req_num; + __u32 result; +}; + +static void md_update_data(struct ldc_channel *lp, + struct ds_cap_state *dp, + void *buf, int len) +{ + struct ds_data *dpkt = buf; + struct ds_md_update_req *rp; + struct { + struct ds_data data; + struct ds_md_update_res res; + } pkt; + + rp = (struct ds_md_update_req *) (dpkt + 1); + + printk(KERN_INFO PFX "Machine description update.\n"); + + memset(&pkt, 0, sizeof(pkt)); + pkt.data.tag.type = DS_DATA; + pkt.data.tag.len = sizeof(pkt) - sizeof(struct ds_msg_tag); + pkt.data.handle = dp->handle; + pkt.res.req_num = rp->req_num; + pkt.res.result = DS_OK; + + ds_send(lp, &pkt, sizeof(pkt)); + + mdesc_update(); +} + +struct ds_shutdown_req { + __u64 req_num; + __u32 ms_delay; +}; + +struct ds_shutdown_res { + __u64 req_num; + __u32 result; + char reason[1]; +}; + +static void domain_shutdown_data(struct ldc_channel *lp, + struct ds_cap_state *dp, + void *buf, int len) +{ + struct ds_data *dpkt = buf; + struct ds_shutdown_req *rp; + struct { + struct ds_data data; + struct ds_shutdown_res res; + } pkt; + + rp = (struct ds_shutdown_req *) (dpkt + 1); + + printk(KERN_ALERT PFX "Shutdown request from " + "LDOM manager received.\n"); + + memset(&pkt, 0, sizeof(pkt)); + pkt.data.tag.type = DS_DATA; + pkt.data.tag.len = sizeof(pkt) - sizeof(struct ds_msg_tag); + pkt.data.handle = dp->handle; + pkt.res.req_num = rp->req_num; + pkt.res.result = DS_OK; + pkt.res.reason[0] = 0; + + ds_send(lp, &pkt, sizeof(pkt)); + + wake_up_powerd(); +} + +struct ds_panic_req { + __u64 req_num; +}; + +struct ds_panic_res { + __u64 req_num; + __u32 result; + char reason[1]; +}; + +static void domain_panic_data(struct ldc_channel *lp, + struct ds_cap_state *dp, + void *buf, int len) +{ + struct ds_data *dpkt = buf; + struct ds_panic_req *rp; + struct { + struct ds_data data; + struct ds_panic_res res; + } pkt; + + rp = (struct ds_panic_req *) (dpkt + 1); + + printk(KERN_ALERT PFX "Panic request from " + "LDOM manager received.\n"); + + memset(&pkt, 0, sizeof(pkt)); + pkt.data.tag.type = DS_DATA; + pkt.data.tag.len = sizeof(pkt) - sizeof(struct ds_msg_tag); + pkt.data.handle = dp->handle; + pkt.res.req_num = rp->req_num; + pkt.res.result = DS_OK; + pkt.res.reason[0] = 0; + + ds_send(lp, &pkt, sizeof(pkt)); + + panic("PANIC requested by LDOM manager."); +} + +#ifdef CONFIG_HOTPLUG_CPU +struct dr_cpu_tag { + __u64 req_num; + __u32 type; +#define DR_CPU_CONFIGURE 0x43 +#define DR_CPU_UNCONFIGURE 0x55 +#define DR_CPU_FORCE_UNCONFIGURE 0x46 +#define DR_CPU_STATUS 0x53 + +/* Responses */ +#define DR_CPU_OK 0x6f +#define DR_CPU_ERROR 0x65 + + __u32 num_records; +}; + +struct dr_cpu_resp_entry { + __u32 cpu; + __u32 result; +#define DR_CPU_RES_OK 0x00 +#define DR_CPU_RES_FAILURE 0x01 +#define DR_CPU_RES_BLOCKED 0x02 +#define DR_CPU_RES_CPU_NOT_RESPONDING 0x03 +#define DR_CPU_RES_NOT_IN_MD 0x04 + + __u32 stat; +#define DR_CPU_STAT_NOT_PRESENT 0x00 +#define DR_CPU_STAT_UNCONFIGURED 0x01 +#define DR_CPU_STAT_CONFIGURED 0x02 + + __u32 str_off; +}; + +/* DR cpu requests get queued onto the work list by the + * dr_cpu_data() callback. The list is protected by + * ds_lock, and processed by dr_cpu_process() in order. + */ +static LIST_HEAD(dr_cpu_work_list); +static DECLARE_WAIT_QUEUE_HEAD(dr_cpu_wait); + +struct dr_cpu_queue_entry { + struct list_head list; + char req[0]; +}; + +static void __dr_cpu_send_error(struct ds_cap_state *cp, struct ds_data *data) +{ + struct dr_cpu_tag *tag = (struct dr_cpu_tag *) (data + 1); + struct ds_info *dp = ds_info; + struct { + struct ds_data data; + struct dr_cpu_tag tag; + } pkt; + int msg_len; + + memset(&pkt, 0, sizeof(pkt)); + pkt.data.tag.type = DS_DATA; + pkt.data.handle = cp->handle; + pkt.tag.req_num = tag->req_num; + pkt.tag.type = DR_CPU_ERROR; + pkt.tag.num_records = 0; + + msg_len = (sizeof(struct ds_data) + + sizeof(struct dr_cpu_tag)); + + pkt.data.tag.len = msg_len - sizeof(struct ds_msg_tag); + + ds_send(dp->lp, &pkt, msg_len); +} + +static void dr_cpu_send_error(struct ds_cap_state *cp, struct ds_data *data) +{ + unsigned long flags; + + spin_lock_irqsave(&ds_lock, flags); + __dr_cpu_send_error(cp, data); + spin_unlock_irqrestore(&ds_lock, flags); +} + +#define CPU_SENTINEL 0xffffffff + +static void purge_dups(u32 *list, u32 num_ents) +{ + unsigned int i; + + for (i = 0; i < num_ents; i++) { + u32 cpu = list[i]; + unsigned int j; + + if (cpu == CPU_SENTINEL) + continue; + + for (j = i + 1; j < num_ents; j++) { + if (list[j] == cpu) + list[j] = CPU_SENTINEL; + } + } +} + +static int dr_cpu_size_response(int ncpus) +{ + return (sizeof(struct ds_data) + + sizeof(struct dr_cpu_tag) + + (sizeof(struct dr_cpu_resp_entry) * ncpus)); +} + +static void dr_cpu_init_response(struct ds_data *resp, u64 req_num, + u64 handle, int resp_len, int ncpus, + cpumask_t *mask, u32 default_stat) +{ + struct dr_cpu_resp_entry *ent; + struct dr_cpu_tag *tag; + int i, cpu; + + tag = (struct dr_cpu_tag *) (resp + 1); + ent = (struct dr_cpu_resp_entry *) (tag + 1); + + resp->tag.type = DS_DATA; + resp->tag.len = resp_len - sizeof(struct ds_msg_tag); + resp->handle = handle; + tag->req_num = req_num; + tag->type = DR_CPU_OK; + tag->num_records = ncpus; + + i = 0; + for_each_cpu_mask(cpu, *mask) { + ent[i].cpu = cpu; + ent[i].result = DR_CPU_RES_OK; + ent[i].stat = default_stat; + i++; + } + BUG_ON(i != ncpus); +} + +static void dr_cpu_mark(struct ds_data *resp, int cpu, int ncpus, + u32 res, u32 stat) +{ + struct dr_cpu_resp_entry *ent; + struct dr_cpu_tag *tag; + int i; + + tag = (struct dr_cpu_tag *) (resp + 1); + ent = (struct dr_cpu_resp_entry *) (tag + 1); + + for (i = 0; i < ncpus; i++) { + if (ent[i].cpu != cpu) + continue; + ent[i].result = res; + ent[i].stat = stat; + break; + } +} + +static int dr_cpu_configure(struct ds_cap_state *cp, u64 req_num, + cpumask_t *mask) +{ + struct ds_data *resp; + int resp_len, ncpus, cpu; + unsigned long flags; + + ncpus = cpus_weight(*mask); + resp_len = dr_cpu_size_response(ncpus); + resp = kzalloc(resp_len, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + dr_cpu_init_response(resp, req_num, cp->handle, + resp_len, ncpus, mask, + DR_CPU_STAT_CONFIGURED); + + mdesc_fill_in_cpu_data(*mask); + + for_each_cpu_mask(cpu, *mask) { + int err; + + printk(KERN_INFO PFX "Starting cpu %d...\n", cpu); + err = cpu_up(cpu); + if (err) { + __u32 res = DR_CPU_RES_FAILURE; + __u32 stat = DR_CPU_STAT_UNCONFIGURED; + + if (!cpu_present(cpu)) { + /* CPU not present in MD */ + res = DR_CPU_RES_NOT_IN_MD; + stat = DR_CPU_STAT_NOT_PRESENT; + } else if (err == -ENODEV) { + /* CPU did not call in successfully */ + res = DR_CPU_RES_CPU_NOT_RESPONDING; + } + + printk(KERN_INFO PFX "CPU startup failed err=%d\n", + err); + dr_cpu_mark(resp, cpu, ncpus, res, stat); + } + } + + spin_lock_irqsave(&ds_lock, flags); + ds_send(ds_info->lp, resp, resp_len); + spin_unlock_irqrestore(&ds_lock, flags); + + kfree(resp); + + /* Redistribute IRQs, taking into account the new cpus. */ + fixup_irqs(); + + return 0; +} + +static int dr_cpu_unconfigure(struct ds_cap_state *cp, u64 req_num, + cpumask_t *mask) +{ + struct ds_data *resp; + int resp_len, ncpus, cpu; + unsigned long flags; + + ncpus = cpus_weight(*mask); + resp_len = dr_cpu_size_response(ncpus); + resp = kzalloc(resp_len, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + dr_cpu_init_response(resp, req_num, cp->handle, + resp_len, ncpus, mask, + DR_CPU_STAT_UNCONFIGURED); + + for_each_cpu_mask(cpu, *mask) { + int err; + + printk(KERN_INFO PFX "CPU[%d]: Shutting down cpu %d...\n", + smp_processor_id(), cpu); + err = cpu_down(cpu); + if (err) + dr_cpu_mark(resp, cpu, ncpus, + DR_CPU_RES_FAILURE, + DR_CPU_STAT_CONFIGURED); + } + + spin_lock_irqsave(&ds_lock, flags); + ds_send(ds_info->lp, resp, resp_len); + spin_unlock_irqrestore(&ds_lock, flags); + + kfree(resp); + + return 0; +} + +static void process_dr_cpu_list(struct ds_cap_state *cp) +{ + struct dr_cpu_queue_entry *qp, *tmp; + unsigned long flags; + LIST_HEAD(todo); + cpumask_t mask; + + spin_lock_irqsave(&ds_lock, flags); + list_splice(&dr_cpu_work_list, &todo); + INIT_LIST_HEAD(&dr_cpu_work_list); + spin_unlock_irqrestore(&ds_lock, flags); + + list_for_each_entry_safe(qp, tmp, &todo, list) { + struct ds_data *data = (struct ds_data *) qp->req; + struct dr_cpu_tag *tag = (struct dr_cpu_tag *) (data + 1); + u32 *cpu_list = (u32 *) (tag + 1); + u64 req_num = tag->req_num; + unsigned int i; + int err; + + switch (tag->type) { + case DR_CPU_CONFIGURE: + case DR_CPU_UNCONFIGURE: + case DR_CPU_FORCE_UNCONFIGURE: + break; + + default: + dr_cpu_send_error(cp, data); + goto next; + } + + purge_dups(cpu_list, tag->num_records); + + cpus_clear(mask); + for (i = 0; i < tag->num_records; i++) { + if (cpu_list[i] == CPU_SENTINEL) + continue; + + if (cpu_list[i] < NR_CPUS) + cpu_set(cpu_list[i], mask); + } + + if (tag->type == DR_CPU_CONFIGURE) + err = dr_cpu_configure(cp, req_num, &mask); + else + err = dr_cpu_unconfigure(cp, req_num, &mask); + + if (err) + dr_cpu_send_error(cp, data); + +next: + list_del(&qp->list); + kfree(qp); + } +} + +static int dr_cpu_thread(void *__unused) +{ + struct ds_cap_state *cp; + DEFINE_WAIT(wait); + + cp = find_cap_by_string("dr-cpu"); + + while (1) { + prepare_to_wait(&dr_cpu_wait, &wait, TASK_INTERRUPTIBLE); + if (list_empty(&dr_cpu_work_list)) + schedule(); + finish_wait(&dr_cpu_wait, &wait); + + if (kthread_should_stop()) + break; + + process_dr_cpu_list(cp); + } + + return 0; +} + +static void dr_cpu_data(struct ldc_channel *lp, + struct ds_cap_state *dp, + void *buf, int len) +{ + struct dr_cpu_queue_entry *qp; + struct ds_data *dpkt = buf; + struct dr_cpu_tag *rp; + + rp = (struct dr_cpu_tag *) (dpkt + 1); + + qp = kmalloc(sizeof(struct dr_cpu_queue_entry) + len, GFP_ATOMIC); + if (!qp) { + struct ds_cap_state *cp; + + cp = find_cap_by_string("dr-cpu"); + __dr_cpu_send_error(cp, dpkt); + } else { + memcpy(&qp->req, buf, len); + list_add_tail(&qp->list, &dr_cpu_work_list); + wake_up(&dr_cpu_wait); + } +} +#endif + +struct ds_pri_msg { + __u64 req_num; + __u64 type; +#define DS_PRI_REQUEST 0x00 +#define DS_PRI_DATA 0x01 +#define DS_PRI_UPDATE 0x02 +}; + +static void ds_pri_data(struct ldc_channel *lp, + struct ds_cap_state *dp, + void *buf, int len) +{ + struct ds_data *dpkt = buf; + struct ds_pri_msg *rp; + + rp = (struct ds_pri_msg *) (dpkt + 1); + + printk(KERN_INFO PFX "PRI REQ [%lx:%lx], len=%d\n", + rp->req_num, rp->type, len); +} + +struct ds_var_hdr { + __u32 type; +#define DS_VAR_SET_REQ 0x00 +#define DS_VAR_DELETE_REQ 0x01 +#define DS_VAR_SET_RESP 0x02 +#define DS_VAR_DELETE_RESP 0x03 +}; + +struct ds_var_set_msg { + struct ds_var_hdr hdr; + char name_and_value[0]; +}; + +struct ds_var_delete_msg { + struct ds_var_hdr hdr; + char name[0]; +}; + +struct ds_var_resp { + struct ds_var_hdr hdr; + __u32 result; +#define DS_VAR_SUCCESS 0x00 +#define DS_VAR_NO_SPACE 0x01 +#define DS_VAR_INVALID_VAR 0x02 +#define DS_VAR_INVALID_VAL 0x03 +#define DS_VAR_NOT_PRESENT 0x04 +}; + +static DEFINE_MUTEX(ds_var_mutex); +static int ds_var_doorbell; +static int ds_var_response; + +static void ds_var_data(struct ldc_channel *lp, + struct ds_cap_state *dp, + void *buf, int len) +{ + struct ds_data *dpkt = buf; + struct ds_var_resp *rp; + + rp = (struct ds_var_resp *) (dpkt + 1); + + if (rp->hdr.type != DS_VAR_SET_RESP && + rp->hdr.type != DS_VAR_DELETE_RESP) + return; + + ds_var_response = rp->result; + wmb(); + ds_var_doorbell = 1; +} + +void ldom_set_var(const char *var, const char *value) +{ + struct ds_info *dp = ds_info; + struct ds_cap_state *cp; + + cp = find_cap_by_string("var-config"); + if (cp->state != CAP_STATE_REGISTERED) + cp = find_cap_by_string("var-config-backup"); + + if (cp->state == CAP_STATE_REGISTERED) { + union { + struct { + struct ds_data data; + struct ds_var_set_msg msg; + } header; + char all[512]; + } pkt; + unsigned long flags; + char *base, *p; + int msg_len, loops; + + memset(&pkt, 0, sizeof(pkt)); + pkt.header.data.tag.type = DS_DATA; + pkt.header.data.handle = cp->handle; + pkt.header.msg.hdr.type = DS_VAR_SET_REQ; + base = p = &pkt.header.msg.name_and_value[0]; + strcpy(p, var); + p += strlen(var) + 1; + strcpy(p, value); + p += strlen(value) + 1; + + msg_len = (sizeof(struct ds_data) + + sizeof(struct ds_var_set_msg) + + (p - base)); + msg_len = (msg_len + 3) & ~3; + pkt.header.data.tag.len = msg_len - sizeof(struct ds_msg_tag); + + mutex_lock(&ds_var_mutex); + + spin_lock_irqsave(&ds_lock, flags); + ds_var_doorbell = 0; + ds_var_response = -1; + + ds_send(dp->lp, &pkt, msg_len); + spin_unlock_irqrestore(&ds_lock, flags); + + loops = 1000; + while (ds_var_doorbell == 0) { + if (loops-- < 0) + break; + barrier(); + udelay(100); + } + + mutex_unlock(&ds_var_mutex); + + if (ds_var_doorbell == 0 || + ds_var_response != DS_VAR_SUCCESS) + printk(KERN_ERR PFX "var-config [%s:%s] " + "failed, response(%d).\n", + var, value, + ds_var_response); + } else { + printk(KERN_ERR PFX "var-config not registered so " + "could not set (%s) variable to (%s).\n", + var, value); + } +} + +void ldom_reboot(const char *boot_command) +{ + /* Don't bother with any of this if the boot_command + * is empty. + */ + if (boot_command && strlen(boot_command)) { + char full_boot_str[256]; + + strcpy(full_boot_str, "boot "); + strcpy(full_boot_str + strlen("boot "), boot_command); + + ldom_set_var("reboot-command", full_boot_str); + } + sun4v_mach_sir(); +} + +void ldom_power_off(void) +{ + sun4v_mach_exit(0); +} + +static void ds_conn_reset(struct ds_info *dp) +{ + printk(KERN_ERR PFX "ds_conn_reset() from %p\n", + __builtin_return_address(0)); +} + +static int register_services(struct ds_info *dp) +{ + struct ldc_channel *lp = dp->lp; + int i; + + for (i = 0; i < ARRAY_SIZE(ds_states); i++) { + struct { + struct ds_reg_req req; + u8 id_buf[256]; + } pbuf; + struct ds_cap_state *cp = &ds_states[i]; + int err, msg_len; + u64 new_count; + + if (cp->state == CAP_STATE_REGISTERED) + continue; + + new_count = sched_clock() & 0xffffffff; + cp->handle = ((u64) i << 32) | new_count; + + msg_len = (sizeof(struct ds_reg_req) + + strlen(cp->service_id)); + + memset(&pbuf, 0, sizeof(pbuf)); + pbuf.req.tag.type = DS_REG_REQ; + pbuf.req.tag.len = (msg_len - sizeof(struct ds_msg_tag)); + pbuf.req.handle = cp->handle; + pbuf.req.major = 1; + pbuf.req.minor = 0; + strcpy(pbuf.req.svc_id, cp->service_id); + + err = ds_send(lp, &pbuf, msg_len); + if (err > 0) + cp->state = CAP_STATE_REG_SENT; + } + return 0; +} + +static int ds_handshake(struct ds_info *dp, struct ds_msg_tag *pkt) +{ + + if (dp->hs_state == DS_HS_START) { + if (pkt->type != DS_INIT_ACK) + goto conn_reset; + + dp->hs_state = DS_HS_DONE; + + return register_services(dp); + } + + if (dp->hs_state != DS_HS_DONE) + goto conn_reset; + + if (pkt->type == DS_REG_ACK) { + struct ds_reg_ack *ap = (struct ds_reg_ack *) pkt; + struct ds_cap_state *cp = find_cap(ap->handle); + + if (!cp) { + printk(KERN_ERR PFX "REG ACK for unknown handle %lx\n", + ap->handle); + return 0; + } + printk(KERN_INFO PFX "Registered %s service.\n", + cp->service_id); + cp->state = CAP_STATE_REGISTERED; + } else if (pkt->type == DS_REG_NACK) { + struct ds_reg_nack *np = (struct ds_reg_nack *) pkt; + struct ds_cap_state *cp = find_cap(np->handle); + + if (!cp) { + printk(KERN_ERR PFX "REG NACK for " + "unknown handle %lx\n", + np->handle); + return 0; + } + printk(KERN_INFO PFX "Could not register %s service\n", + cp->service_id); + cp->state = CAP_STATE_UNKNOWN; + } + + return 0; + +conn_reset: + ds_conn_reset(dp); + return -ECONNRESET; +} + +static int ds_data(struct ds_info *dp, struct ds_msg_tag *pkt, int len) +{ + struct ds_data *dpkt = (struct ds_data *) pkt; + struct ds_cap_state *cp = find_cap(dpkt->handle); + + if (!cp) { + struct ds_data_nack nack = { + .tag = { + .type = DS_NACK, + .len = (sizeof(struct ds_data_nack) - + sizeof(struct ds_msg_tag)), + }, + .handle = dpkt->handle, + .result = DS_INV_HDL, + }; + + printk(KERN_ERR PFX "Data for unknown handle %lu\n", + dpkt->handle); + ds_send(dp->lp, &nack, sizeof(nack)); + } else { + cp->data(dp->lp, cp, dpkt, len); + } + return 0; +} + +static void ds_up(struct ds_info *dp) +{ + struct ldc_channel *lp = dp->lp; + struct ds_ver_req req; + int err; + + req.tag.type = DS_INIT_REQ; + req.tag.len = sizeof(req) - sizeof(struct ds_msg_tag); + req.ver.major = 1; + req.ver.minor = 0; + + err = ds_send(lp, &req, sizeof(req)); + if (err > 0) + dp->hs_state = DS_HS_START; +} + +static void ds_event(void *arg, int event) +{ + struct ds_info *dp = arg; + struct ldc_channel *lp = dp->lp; + unsigned long flags; + int err; + + spin_lock_irqsave(&ds_lock, flags); + + if (event == LDC_EVENT_UP) { + ds_up(dp); + spin_unlock_irqrestore(&ds_lock, flags); + return; + } + + if (event != LDC_EVENT_DATA_READY) { + printk(KERN_WARNING PFX "Unexpected LDC event %d\n", event); + spin_unlock_irqrestore(&ds_lock, flags); + return; + } + + err = 0; + while (1) { + struct ds_msg_tag *tag; + + err = ldc_read(lp, dp->rcv_buf, sizeof(*tag)); + + if (unlikely(err < 0)) { + if (err == -ECONNRESET) + ds_conn_reset(dp); + break; + } + if (err == 0) + break; + + tag = dp->rcv_buf; + err = ldc_read(lp, tag + 1, tag->len); + + if (unlikely(err < 0)) { + if (err == -ECONNRESET) + ds_conn_reset(dp); + break; + } + if (err < tag->len) + break; + + if (tag->type < DS_DATA) + err = ds_handshake(dp, dp->rcv_buf); + else + err = ds_data(dp, dp->rcv_buf, + sizeof(*tag) + err); + if (err == -ECONNRESET) + break; + } + + spin_unlock_irqrestore(&ds_lock, flags); +} + +static int __devinit ds_probe(struct vio_dev *vdev, + const struct vio_device_id *id) +{ + static int ds_version_printed; + struct ldc_channel_config ds_cfg = { + .event = ds_event, + .mtu = 4096, + .mode = LDC_MODE_STREAM, + }; + struct ldc_channel *lp; + struct ds_info *dp; + int err; + + if (ds_version_printed++ == 0) + printk(KERN_INFO "%s", version); + + dp = kzalloc(sizeof(*dp), GFP_KERNEL); + err = -ENOMEM; + if (!dp) + goto out_err; + + dp->rcv_buf = kzalloc(4096, GFP_KERNEL); + if (!dp->rcv_buf) + goto out_free_dp; + + dp->rcv_buf_len = 4096; + + ds_cfg.tx_irq = vdev->tx_irq; + ds_cfg.rx_irq = vdev->rx_irq; + + lp = ldc_alloc(vdev->channel_id, &ds_cfg, dp); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out_free_rcv_buf; + } + dp->lp = lp; + + err = ldc_bind(lp, "DS"); + if (err) + goto out_free_ldc; + + ds_info = dp; + + start_powerd(); + + return err; + +out_free_ldc: + ldc_free(dp->lp); + +out_free_rcv_buf: + kfree(dp->rcv_buf); + +out_free_dp: + kfree(dp); + +out_err: + return err; +} + +static int ds_remove(struct vio_dev *vdev) +{ + return 0; +} + +static struct vio_device_id ds_match[] = { + { + .type = "domain-services-port", + }, + {}, +}; + +static struct vio_driver ds_driver = { + .id_table = ds_match, + .probe = ds_probe, + .remove = ds_remove, + .driver = { + .name = "ds", + .owner = THIS_MODULE, + } +}; + +static int __init ds_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ds_states); i++) + ds_states[i].handle = ((u64)i << 32); + +#ifdef CONFIG_HOTPLUG_CPU + kthread_run(dr_cpu_thread, NULL, "kdrcpud"); +#endif + + return vio_register_driver(&ds_driver); +} + +subsys_initcall(ds_init); diff --git a/arch/sparc64/kernel/hvtramp.S b/arch/sparc64/kernel/hvtramp.S new file mode 100644 index 000000000000..76a090e2c2a8 --- /dev/null +++ b/arch/sparc64/kernel/hvtramp.S @@ -0,0 +1,139 @@ +/* hvtramp.S: Hypervisor start-cpu trampoline code. + * + * Copyright (C) 2007 David S. Miller <davem@davemloft.net> + */ + +#include <asm/thread_info.h> +#include <asm/hypervisor.h> +#include <asm/scratchpad.h> +#include <asm/spitfire.h> +#include <asm/hvtramp.h> +#include <asm/pstate.h> +#include <asm/ptrace.h> +#include <asm/asi.h> + + .text + .align 8 + .globl hv_cpu_startup, hv_cpu_startup_end + + /* This code executes directly out of the hypervisor + * with physical addressing (va==pa). %o0 contains + * our client argument which for Linux points to + * a descriptor data structure which defines the + * MMU entries we need to load up. + * + * After we set things up we enable the MMU and call + * into the kernel. + * + * First setup basic privileged cpu state. + */ +hv_cpu_startup: + wrpr %g0, 0, %gl + wrpr %g0, 15, %pil + wrpr %g0, 0, %canrestore + wrpr %g0, 0, %otherwin + wrpr %g0, 6, %cansave + wrpr %g0, 6, %cleanwin + wrpr %g0, 0, %cwp + wrpr %g0, 0, %wstate + wrpr %g0, 0, %tl + + sethi %hi(sparc64_ttable_tl0), %g1 + wrpr %g1, %tba + + mov %o0, %l0 + + lduw [%l0 + HVTRAMP_DESCR_CPU], %g1 + mov SCRATCHPAD_CPUID, %g2 + stxa %g1, [%g2] ASI_SCRATCHPAD + + ldx [%l0 + HVTRAMP_DESCR_FAULT_INFO_VA], %g2 + stxa %g2, [%g0] ASI_SCRATCHPAD + + mov 0, %l1 + lduw [%l0 + HVTRAMP_DESCR_NUM_MAPPINGS], %l2 + add %l0, HVTRAMP_DESCR_MAPS, %l3 + +1: ldx [%l3 + HVTRAMP_MAPPING_VADDR], %o0 + clr %o1 + ldx [%l3 + HVTRAMP_MAPPING_TTE], %o2 + mov HV_MMU_IMMU | HV_MMU_DMMU, %o3 + mov HV_FAST_MMU_MAP_PERM_ADDR, %o5 + ta HV_FAST_TRAP + + brnz,pn %o0, 80f + nop + + add %l1, 1, %l1 + cmp %l1, %l2 + blt,a,pt %xcc, 1b + add %l3, HVTRAMP_MAPPING_SIZE, %l3 + + ldx [%l0 + HVTRAMP_DESCR_FAULT_INFO_PA], %o0 + mov HV_FAST_MMU_FAULT_AREA_CONF, %o5 + ta HV_FAST_TRAP + + brnz,pn %o0, 80f + nop + + wrpr %g0, (PSTATE_PRIV | PSTATE_PEF), %pstate + + ldx [%l0 + HVTRAMP_DESCR_THREAD_REG], %l6 + + mov 1, %o0 + set 1f, %o1 + mov HV_FAST_MMU_ENABLE, %o5 + ta HV_FAST_TRAP + + ba,pt %xcc, 80f + nop + +1: + wr %g0, 0, %fprs + wr %g0, ASI_P, %asi + + mov PRIMARY_CONTEXT, %g7 + stxa %g0, [%g7] ASI_MMU + membar #Sync + + mov SECONDARY_CONTEXT, %g7 + stxa %g0, [%g7] ASI_MMU + membar #Sync + + mov %l6, %g6 + ldx [%g6 + TI_TASK], %g4 + + mov 1, %g5 + sllx %g5, THREAD_SHIFT, %g5 + sub %g5, (STACKFRAME_SZ + STACK_BIAS), %g5 + add %g6, %g5, %sp + mov 0, %fp + + call init_irqwork_curcpu + nop + call hard_smp_processor_id + nop + + mov %o0, %o1 + mov 0, %o0 + mov 0, %o2 + call sun4v_init_mondo_queues + mov 1, %o3 + + call init_cur_cpu_trap + mov %g6, %o0 + + wrpr %g0, (PSTATE_PRIV | PSTATE_PEF | PSTATE_IE), %pstate + + call smp_callin + nop + call cpu_idle + mov 0, %o0 + call cpu_panic + nop + +80: ba,pt %xcc, 80b + nop + + .align 8 +hv_cpu_startup_end: diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c index 6b6165d36fd8..8cb3358674f5 100644 --- a/arch/sparc64/kernel/irq.c +++ b/arch/sparc64/kernel/irq.c @@ -293,6 +293,11 @@ static void sun4u_irq_enable(unsigned int virt_irq) } } +static void sun4u_set_affinity(unsigned int virt_irq, cpumask_t mask) +{ + sun4u_irq_enable(virt_irq); +} + static void sun4u_irq_disable(unsigned int virt_irq) { struct irq_handler_data *data = get_irq_chip_data(virt_irq); @@ -309,6 +314,10 @@ static void sun4u_irq_disable(unsigned int virt_irq) static void sun4u_irq_end(unsigned int virt_irq) { struct irq_handler_data *data = get_irq_chip_data(virt_irq); + struct irq_desc *desc = irq_desc + virt_irq; + + if (unlikely(desc->status & (IRQ_DISABLED|IRQ_INPROGRESS))) + return; if (likely(data)) upa_writeq(ICLR_IDLE, data->iclr); @@ -340,6 +349,24 @@ static void sun4v_irq_enable(unsigned int virt_irq) } } +static void sun4v_set_affinity(unsigned int virt_irq, cpumask_t mask) +{ + struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq); + unsigned int ino = bucket - &ivector_table[0]; + + if (likely(bucket)) { + unsigned long cpuid; + int err; + + cpuid = irq_choose_cpu(virt_irq); + + err = sun4v_intr_settarget(ino, cpuid); + if (err != HV_EOK) + printk("sun4v_intr_settarget(%x,%lu): err(%d)\n", + ino, cpuid, err); + } +} + static void sun4v_irq_disable(unsigned int virt_irq) { struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq); @@ -373,6 +400,10 @@ static void sun4v_irq_end(unsigned int virt_irq) { struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq); unsigned int ino = bucket - &ivector_table[0]; + struct irq_desc *desc = irq_desc + virt_irq; + + if (unlikely(desc->status & (IRQ_DISABLED|IRQ_INPROGRESS))) + return; if (likely(bucket)) { int err; @@ -418,6 +449,28 @@ static void sun4v_virq_enable(unsigned int virt_irq) } } +static void sun4v_virt_set_affinity(unsigned int virt_irq, cpumask_t mask) +{ + struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq); + unsigned int ino = bucket - &ivector_table[0]; + + if (likely(bucket)) { + unsigned long cpuid, dev_handle, dev_ino; + int err; + + cpuid = irq_choose_cpu(virt_irq); + + dev_handle = ino & IMAP_IGN; + dev_ino = ino & IMAP_INO; + + err = sun4v_vintr_set_target(dev_handle, dev_ino, cpuid); + if (err != HV_EOK) + printk("sun4v_vintr_set_target(%lx,%lx,%lu): " + "err(%d)\n", + dev_handle, dev_ino, cpuid, err); + } +} + static void sun4v_virq_disable(unsigned int virt_irq) { struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq); @@ -443,6 +496,10 @@ static void sun4v_virq_end(unsigned int virt_irq) { struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq); unsigned int ino = bucket - &ivector_table[0]; + struct irq_desc *desc = irq_desc + virt_irq; + + if (unlikely(desc->status & (IRQ_DISABLED|IRQ_INPROGRESS))) + return; if (likely(bucket)) { unsigned long dev_handle, dev_ino; @@ -477,6 +534,7 @@ static struct irq_chip sun4u_irq = { .enable = sun4u_irq_enable, .disable = sun4u_irq_disable, .end = sun4u_irq_end, + .set_affinity = sun4u_set_affinity, }; static struct irq_chip sun4u_irq_ack = { @@ -485,6 +543,7 @@ static struct irq_chip sun4u_irq_ack = { .disable = sun4u_irq_disable, .ack = run_pre_handler, .end = sun4u_irq_end, + .set_affinity = sun4u_set_affinity, }; static struct irq_chip sun4v_irq = { @@ -492,6 +551,7 @@ static struct irq_chip sun4v_irq = { .enable = sun4v_irq_enable, .disable = sun4v_irq_disable, .end = sun4v_irq_end, + .set_affinity = sun4v_set_affinity, }; static struct irq_chip sun4v_irq_ack = { @@ -500,6 +560,7 @@ static struct irq_chip sun4v_irq_ack = { .disable = sun4v_irq_disable, .ack = run_pre_handler, .end = sun4v_irq_end, + .set_affinity = sun4v_set_affinity, }; #ifdef CONFIG_PCI_MSI @@ -511,6 +572,7 @@ static struct irq_chip sun4v_msi = { .disable = sun4v_msi_disable, .ack = run_pre_handler, .end = sun4v_irq_end, + .set_affinity = sun4v_set_affinity, }; #endif @@ -519,6 +581,7 @@ static struct irq_chip sun4v_virq = { .enable = sun4v_virq_enable, .disable = sun4v_virq_disable, .end = sun4v_virq_end, + .set_affinity = sun4v_virt_set_affinity, }; static struct irq_chip sun4v_virq_ack = { @@ -527,6 +590,7 @@ static struct irq_chip sun4v_virq_ack = { .disable = sun4v_virq_disable, .ack = run_pre_handler, .end = sun4v_virq_end, + .set_affinity = sun4v_virt_set_affinity, }; void irq_install_pre_handler(int virt_irq, @@ -739,6 +803,26 @@ void handler_irq(int irq, struct pt_regs *regs) set_irq_regs(old_regs); } +#ifdef CONFIG_HOTPLUG_CPU +void fixup_irqs(void) +{ + unsigned int irq; + + for (irq = 0; irq < NR_IRQS; irq++) { + unsigned long flags; + + spin_lock_irqsave(&irq_desc[irq].lock, flags); + if (irq_desc[irq].action && + !(irq_desc[irq].status & IRQ_PER_CPU)) { + if (irq_desc[irq].chip->set_affinity) + irq_desc[irq].chip->set_affinity(irq, + irq_desc[irq].affinity); + } + spin_unlock_irqrestore(&irq_desc[irq].lock, flags); + } +} +#endif + struct sun5_timer { u64 count0; u64 limit0; diff --git a/arch/sparc64/kernel/ldc.c b/arch/sparc64/kernel/ldc.c new file mode 100644 index 000000000000..85a2be0b0962 --- /dev/null +++ b/arch/sparc64/kernel/ldc.c @@ -0,0 +1,2373 @@ +/* ldc.c: Logical Domain Channel link-layer protocol driver. + * + * Copyright (C) 2007 David S. Miller <davem@davemloft.net> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/scatterlist.h> +#include <linux/interrupt.h> +#include <linux/list.h> +#include <linux/init.h> + +#include <asm/hypervisor.h> +#include <asm/iommu.h> +#include <asm/page.h> +#include <asm/ldc.h> +#include <asm/mdesc.h> + +#define DRV_MODULE_NAME "ldc" +#define PFX DRV_MODULE_NAME ": " +#define DRV_MODULE_VERSION "1.0" +#define DRV_MODULE_RELDATE "June 25, 2007" + +static char version[] __devinitdata = + DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; +#define LDC_PACKET_SIZE 64 + +/* Packet header layout for unreliable and reliable mode frames. + * When in RAW mode, packets are simply straight 64-byte payloads + * with no headers. + */ +struct ldc_packet { + u8 type; +#define LDC_CTRL 0x01 +#define LDC_DATA 0x02 +#define LDC_ERR 0x10 + + u8 stype; +#define LDC_INFO 0x01 +#define LDC_ACK 0x02 +#define LDC_NACK 0x04 + + u8 ctrl; +#define LDC_VERS 0x01 /* Link Version */ +#define LDC_RTS 0x02 /* Request To Send */ +#define LDC_RTR 0x03 /* Ready To Receive */ +#define LDC_RDX 0x04 /* Ready for Data eXchange */ +#define LDC_CTRL_MSK 0x0f + + u8 env; +#define LDC_LEN 0x3f +#define LDC_FRAG_MASK 0xc0 +#define LDC_START 0x40 +#define LDC_STOP 0x80 + + u32 seqid; + + union { + u8 u_data[LDC_PACKET_SIZE - 8]; + struct { + u32 pad; + u32 ackid; + u8 r_data[LDC_PACKET_SIZE - 8 - 8]; + } r; + } u; +}; + +struct ldc_version { + u16 major; + u16 minor; +}; + +/* Ordered from largest major to lowest. */ +static struct ldc_version ver_arr[] = { + { .major = 1, .minor = 0 }, +}; + +#define LDC_DEFAULT_MTU (4 * LDC_PACKET_SIZE) +#define LDC_DEFAULT_NUM_ENTRIES (PAGE_SIZE / LDC_PACKET_SIZE) + +struct ldc_channel; + +struct ldc_mode_ops { + int (*write)(struct ldc_channel *, const void *, unsigned int); + int (*read)(struct ldc_channel *, void *, unsigned int); +}; + +static const struct ldc_mode_ops raw_ops; +static const struct ldc_mode_ops nonraw_ops; +static const struct ldc_mode_ops stream_ops; + +int ldom_domaining_enabled; + +struct ldc_iommu { + /* Protects arena alloc/free. */ + spinlock_t lock; + struct iommu_arena arena; + struct ldc_mtable_entry *page_table; +}; + +struct ldc_channel { + /* Protects all operations that depend upon channel state. */ + spinlock_t lock; + + unsigned long id; + + u8 *mssbuf; + u32 mssbuf_len; + u32 mssbuf_off; + + struct ldc_packet *tx_base; + unsigned long tx_head; + unsigned long tx_tail; + unsigned long tx_num_entries; + unsigned long tx_ra; + + unsigned long tx_acked; + + struct ldc_packet *rx_base; + unsigned long rx_head; + unsigned long rx_tail; + unsigned long rx_num_entries; + unsigned long rx_ra; + + u32 rcv_nxt; + u32 snd_nxt; + + unsigned long chan_state; + + struct ldc_channel_config cfg; + void *event_arg; + + const struct ldc_mode_ops *mops; + + struct ldc_iommu iommu; + + struct ldc_version ver; + + u8 hs_state; +#define LDC_HS_CLOSED 0x00 +#define LDC_HS_OPEN 0x01 +#define LDC_HS_GOTVERS 0x02 +#define LDC_HS_SENTRTR 0x03 +#define LDC_HS_GOTRTR 0x04 +#define LDC_HS_COMPLETE 0x10 + + u8 flags; +#define LDC_FLAG_ALLOCED_QUEUES 0x01 +#define LDC_FLAG_REGISTERED_QUEUES 0x02 +#define LDC_FLAG_REGISTERED_IRQS 0x04 +#define LDC_FLAG_RESET 0x10 + + u8 mss; + u8 state; + +#define LDC_IRQ_NAME_MAX 32 + char rx_irq_name[LDC_IRQ_NAME_MAX]; + char tx_irq_name[LDC_IRQ_NAME_MAX]; + + struct hlist_head mh_list; + + struct hlist_node list; +}; + +#define ldcdbg(TYPE, f, a...) \ +do { if (lp->cfg.debug & LDC_DEBUG_##TYPE) \ + printk(KERN_INFO PFX "ID[%lu] " f, lp->id, ## a); \ +} while (0) + +static const char *state_to_str(u8 state) +{ + switch (state) { + case LDC_STATE_INVALID: + return "INVALID"; + case LDC_STATE_INIT: + return "INIT"; + case LDC_STATE_BOUND: + return "BOUND"; + case LDC_STATE_READY: + return "READY"; + case LDC_STATE_CONNECTED: + return "CONNECTED"; + default: + return "<UNKNOWN>"; + } +} + +static void ldc_set_state(struct ldc_channel *lp, u8 state) +{ + ldcdbg(STATE, "STATE (%s) --> (%s)\n", + state_to_str(lp->state), + state_to_str(state)); + + lp->state = state; +} + +static unsigned long __advance(unsigned long off, unsigned long num_entries) +{ + off += LDC_PACKET_SIZE; + if (off == (num_entries * LDC_PACKET_SIZE)) + off = 0; + + return off; +} + +static unsigned long rx_advance(struct ldc_channel *lp, unsigned long off) +{ + return __advance(off, lp->rx_num_entries); +} + +static unsigned long tx_advance(struct ldc_channel *lp, unsigned long off) +{ + return __advance(off, lp->tx_num_entries); +} + +static struct ldc_packet *handshake_get_tx_packet(struct ldc_channel *lp, + unsigned long *new_tail) +{ + struct ldc_packet *p; + unsigned long t; + + t = tx_advance(lp, lp->tx_tail); + if (t == lp->tx_head) + return NULL; + + *new_tail = t; + + p = lp->tx_base; + return p + (lp->tx_tail / LDC_PACKET_SIZE); +} + +/* When we are in reliable or stream mode, have to track the next packet + * we haven't gotten an ACK for in the TX queue using tx_acked. We have + * to be careful not to stomp over the queue past that point. During + * the handshake, we don't have TX data packets pending in the queue + * and that's why handshake_get_tx_packet() need not be mindful of + * lp->tx_acked. + */ +static unsigned long head_for_data(struct ldc_channel *lp) +{ + if (lp->cfg.mode == LDC_MODE_STREAM) + return lp->tx_acked; + return lp->tx_head; +} + +static int tx_has_space_for(struct ldc_channel *lp, unsigned int size) +{ + unsigned long limit, tail, new_tail, diff; + unsigned int mss; + + limit = head_for_data(lp); + tail = lp->tx_tail; + new_tail = tx_advance(lp, tail); + if (new_tail == limit) + return 0; + + if (limit > new_tail) + diff = limit - new_tail; + else + diff = (limit + + ((lp->tx_num_entries * LDC_PACKET_SIZE) - new_tail)); + diff /= LDC_PACKET_SIZE; + mss = lp->mss; + + if (diff * mss < size) + return 0; + + return 1; +} + +static struct ldc_packet *data_get_tx_packet(struct ldc_channel *lp, + unsigned long *new_tail) +{ + struct ldc_packet *p; + unsigned long h, t; + + h = head_for_data(lp); + t = tx_advance(lp, lp->tx_tail); + if (t == h) + return NULL; + + *new_tail = t; + + p = lp->tx_base; + return p + (lp->tx_tail / LDC_PACKET_SIZE); +} + +static int set_tx_tail(struct ldc_channel *lp, unsigned long tail) +{ + unsigned long orig_tail = lp->tx_tail; + int limit = 1000; + + lp->tx_tail = tail; + while (limit-- > 0) { + unsigned long err; + + err = sun4v_ldc_tx_set_qtail(lp->id, tail); + if (!err) + return 0; + + if (err != HV_EWOULDBLOCK) { + lp->tx_tail = orig_tail; + return -EINVAL; + } + udelay(1); + } + + lp->tx_tail = orig_tail; + return -EBUSY; +} + +/* This just updates the head value in the hypervisor using + * a polling loop with a timeout. The caller takes care of + * upating software state representing the head change, if any. + */ +static int __set_rx_head(struct ldc_channel *lp, unsigned long head) +{ + int limit = 1000; + + while (limit-- > 0) { + unsigned long err; + + err = sun4v_ldc_rx_set_qhead(lp->id, head); + if (!err) + return 0; + + if (err != HV_EWOULDBLOCK) + return -EINVAL; + + udelay(1); + } + + return -EBUSY; +} + +static int send_tx_packet(struct ldc_channel *lp, + struct ldc_packet *p, + unsigned long new_tail) +{ + BUG_ON(p != (lp->tx_base + (lp->tx_tail / LDC_PACKET_SIZE))); + + return set_tx_tail(lp, new_tail); +} + +static struct ldc_packet *handshake_compose_ctrl(struct ldc_channel *lp, + u8 stype, u8 ctrl, + void *data, int dlen, + unsigned long *new_tail) +{ + struct ldc_packet *p = handshake_get_tx_packet(lp, new_tail); + + if (p) { + memset(p, 0, sizeof(*p)); + p->type = LDC_CTRL; + p->stype = stype; + p->ctrl = ctrl; + if (data) + memcpy(p->u.u_data, data, dlen); + } + return p; +} + +static int start_handshake(struct ldc_channel *lp) +{ + struct ldc_packet *p; + struct ldc_version *ver; + unsigned long new_tail; + + ver = &ver_arr[0]; + + ldcdbg(HS, "SEND VER INFO maj[%u] min[%u]\n", + ver->major, ver->minor); + + p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS, + ver, sizeof(*ver), &new_tail); + if (p) { + int err = send_tx_packet(lp, p, new_tail); + if (!err) + lp->flags &= ~LDC_FLAG_RESET; + return err; + } + return -EBUSY; +} + +static int send_version_nack(struct ldc_channel *lp, + u16 major, u16 minor) +{ + struct ldc_packet *p; + struct ldc_version ver; + unsigned long new_tail; + + ver.major = major; + ver.minor = minor; + + p = handshake_compose_ctrl(lp, LDC_NACK, LDC_VERS, + &ver, sizeof(ver), &new_tail); + if (p) { + ldcdbg(HS, "SEND VER NACK maj[%u] min[%u]\n", + ver.major, ver.minor); + + return send_tx_packet(lp, p, new_tail); + } + return -EBUSY; +} + +static int send_version_ack(struct ldc_channel *lp, + struct ldc_version *vp) +{ + struct ldc_packet *p; + unsigned long new_tail; + + p = handshake_compose_ctrl(lp, LDC_ACK, LDC_VERS, + vp, sizeof(*vp), &new_tail); + if (p) { + ldcdbg(HS, "SEND VER ACK maj[%u] min[%u]\n", + vp->major, vp->minor); + + return send_tx_packet(lp, p, new_tail); + } + return -EBUSY; +} + +static int send_rts(struct ldc_channel *lp) +{ + struct ldc_packet *p; + unsigned long new_tail; + + p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RTS, NULL, 0, + &new_tail); + if (p) { + p->env = lp->cfg.mode; + p->seqid = 0; + lp->rcv_nxt = 0; + + ldcdbg(HS, "SEND RTS env[0x%x] seqid[0x%x]\n", + p->env, p->seqid); + + return send_tx_packet(lp, p, new_tail); + } + return -EBUSY; +} + +static int send_rtr(struct ldc_channel *lp) +{ + struct ldc_packet *p; + unsigned long new_tail; + + p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RTR, NULL, 0, + &new_tail); + if (p) { + p->env = lp->cfg.mode; + p->seqid = 0; + + ldcdbg(HS, "SEND RTR env[0x%x] seqid[0x%x]\n", + p->env, p->seqid); + + return send_tx_packet(lp, p, new_tail); + } + return -EBUSY; +} + +static int send_rdx(struct ldc_channel *lp) +{ + struct ldc_packet *p; + unsigned long new_tail; + + p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RDX, NULL, 0, + &new_tail); + if (p) { + p->env = 0; + p->seqid = ++lp->snd_nxt; + p->u.r.ackid = lp->rcv_nxt; + + ldcdbg(HS, "SEND RDX env[0x%x] seqid[0x%x] ackid[0x%x]\n", + p->env, p->seqid, p->u.r.ackid); + + return send_tx_packet(lp, p, new_tail); + } + return -EBUSY; +} + +static int send_data_nack(struct ldc_channel *lp, struct ldc_packet *data_pkt) +{ + struct ldc_packet *p; + unsigned long new_tail; + int err; + + p = data_get_tx_packet(lp, &new_tail); + if (!p) + return -EBUSY; + memset(p, 0, sizeof(*p)); + p->type = data_pkt->type; + p->stype = LDC_NACK; + p->ctrl = data_pkt->ctrl & LDC_CTRL_MSK; + p->seqid = lp->snd_nxt + 1; + p->u.r.ackid = lp->rcv_nxt; + + ldcdbg(HS, "SEND DATA NACK type[0x%x] ctl[0x%x] seq[0x%x] ack[0x%x]\n", + p->type, p->ctrl, p->seqid, p->u.r.ackid); + + err = send_tx_packet(lp, p, new_tail); + if (!err) + lp->snd_nxt++; + + return err; +} + +static int ldc_abort(struct ldc_channel *lp) +{ + unsigned long hv_err; + + ldcdbg(STATE, "ABORT\n"); + + /* We report but do not act upon the hypervisor errors because + * there really isn't much we can do if they fail at this point. + */ + hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries); + if (hv_err) + printk(KERN_ERR PFX "ldc_abort: " + "sun4v_ldc_tx_qconf(%lx,%lx,%lx) failed, err=%lu\n", + lp->id, lp->tx_ra, lp->tx_num_entries, hv_err); + + hv_err = sun4v_ldc_tx_get_state(lp->id, + &lp->tx_head, + &lp->tx_tail, + &lp->chan_state); + if (hv_err) + printk(KERN_ERR PFX "ldc_abort: " + "sun4v_ldc_tx_get_state(%lx,...) failed, err=%lu\n", + lp->id, hv_err); + + hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries); + if (hv_err) + printk(KERN_ERR PFX "ldc_abort: " + "sun4v_ldc_rx_qconf(%lx,%lx,%lx) failed, err=%lu\n", + lp->id, lp->rx_ra, lp->rx_num_entries, hv_err); + + /* Refetch the RX queue state as well, because we could be invoked + * here in the queue processing context. + */ + hv_err = sun4v_ldc_rx_get_state(lp->id, + &lp->rx_head, + &lp->rx_tail, + &lp->chan_state); + if (hv_err) + printk(KERN_ERR PFX "ldc_abort: " + "sun4v_ldc_rx_get_state(%lx,...) failed, err=%lu\n", + lp->id, hv_err); + + return -ECONNRESET; +} + +static struct ldc_version *find_by_major(u16 major) +{ + struct ldc_version *ret = NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(ver_arr); i++) { + struct ldc_version *v = &ver_arr[i]; + if (v->major <= major) { + ret = v; + break; + } + } + return ret; +} + +static int process_ver_info(struct ldc_channel *lp, struct ldc_version *vp) +{ + struct ldc_version *vap; + int err; + + ldcdbg(HS, "GOT VERSION INFO major[%x] minor[%x]\n", + vp->major, vp->minor); + + if (lp->hs_state == LDC_HS_GOTVERS) { + lp->hs_state = LDC_HS_OPEN; + memset(&lp->ver, 0, sizeof(lp->ver)); + } + + vap = find_by_major(vp->major); + if (!vap) { + err = send_version_nack(lp, 0, 0); + } else if (vap->major != vp->major) { + err = send_version_nack(lp, vap->major, vap->minor); + } else { + struct ldc_version ver = *vp; + if (ver.minor > vap->minor) + ver.minor = vap->minor; + err = send_version_ack(lp, &ver); + if (!err) { + lp->ver = ver; + lp->hs_state = LDC_HS_GOTVERS; + } + } + if (err) + return ldc_abort(lp); + + return 0; +} + +static int process_ver_ack(struct ldc_channel *lp, struct ldc_version *vp) +{ + ldcdbg(HS, "GOT VERSION ACK major[%x] minor[%x]\n", + vp->major, vp->minor); + + if (lp->hs_state == LDC_HS_GOTVERS) { + if (lp->ver.major != vp->major || + lp->ver.minor != vp->minor) + return ldc_abort(lp); + } else { + lp->ver = *vp; + lp->hs_state = LDC_HS_GOTVERS; + } + if (send_rts(lp)) + return ldc_abort(lp); + return 0; +} + +static int process_ver_nack(struct ldc_channel *lp, struct ldc_version *vp) +{ + struct ldc_version *vap; + + if ((vp->major == 0 && vp->minor == 0) || + !(vap = find_by_major(vp->major))) { + return ldc_abort(lp); + } else { + struct ldc_packet *p; + unsigned long new_tail; + + p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS, + vap, sizeof(*vap), + &new_tail); + if (p) + return send_tx_packet(lp, p, new_tail); + else + return ldc_abort(lp); + } +} + +static int process_version(struct ldc_channel *lp, + struct ldc_packet *p) +{ + struct ldc_version *vp; + + vp = (struct ldc_version *) p->u.u_data; + + switch (p->stype) { + case LDC_INFO: + return process_ver_info(lp, vp); + + case LDC_ACK: + return process_ver_ack(lp, vp); + + case LDC_NACK: + return process_ver_nack(lp, vp); + + default: + return ldc_abort(lp); + } +} + +static int process_rts(struct ldc_channel *lp, + struct ldc_packet *p) +{ + ldcdbg(HS, "GOT RTS stype[%x] seqid[%x] env[%x]\n", + p->stype, p->seqid, p->env); + + if (p->stype != LDC_INFO || + lp->hs_state != LDC_HS_GOTVERS || + p->env != lp->cfg.mode) + return ldc_abort(lp); + + lp->snd_nxt = p->seqid; + lp->rcv_nxt = p->seqid; + lp->hs_state = LDC_HS_SENTRTR; + if (send_rtr(lp)) + return ldc_abort(lp); + + return 0; +} + +static int process_rtr(struct ldc_channel *lp, + struct ldc_packet *p) +{ + ldcdbg(HS, "GOT RTR stype[%x] seqid[%x] env[%x]\n", + p->stype, p->seqid, p->env); + + if (p->stype != LDC_INFO || + p->env != lp->cfg.mode) + return ldc_abort(lp); + + lp->snd_nxt = p->seqid; + lp->hs_state = LDC_HS_COMPLETE; + ldc_set_state(lp, LDC_STATE_CONNECTED); + send_rdx(lp); + + return LDC_EVENT_UP; +} + +static int rx_seq_ok(struct ldc_channel *lp, u32 seqid) +{ + return lp->rcv_nxt + 1 == seqid; +} + +static int process_rdx(struct ldc_channel *lp, + struct ldc_packet *p) +{ + ldcdbg(HS, "GOT RDX stype[%x] seqid[%x] env[%x] ackid[%x]\n", + p->stype, p->seqid, p->env, p->u.r.ackid); + + if (p->stype != LDC_INFO || + !(rx_seq_ok(lp, p->seqid))) + return ldc_abort(lp); + + lp->rcv_nxt = p->seqid; + + lp->hs_state = LDC_HS_COMPLETE; + ldc_set_state(lp, LDC_STATE_CONNECTED); + + return LDC_EVENT_UP; +} + +static int process_control_frame(struct ldc_channel *lp, + struct ldc_packet *p) +{ + switch (p->ctrl) { + case LDC_VERS: + return process_version(lp, p); + + case LDC_RTS: + return process_rts(lp, p); + + case LDC_RTR: + return process_rtr(lp, p); + + case LDC_RDX: + return process_rdx(lp, p); + + default: + return ldc_abort(lp); + } +} + +static int process_error_frame(struct ldc_channel *lp, + struct ldc_packet *p) +{ + return ldc_abort(lp); +} + +static int process_data_ack(struct ldc_channel *lp, + struct ldc_packet *ack) +{ + unsigned long head = lp->tx_acked; + u32 ackid = ack->u.r.ackid; + + while (1) { + struct ldc_packet *p = lp->tx_base + (head / LDC_PACKET_SIZE); + + head = tx_advance(lp, head); + + if (p->seqid == ackid) { + lp->tx_acked = head; + return 0; + } + if (head == lp->tx_tail) + return ldc_abort(lp); + } + + return 0; +} + +static void send_events(struct ldc_channel *lp, unsigned int event_mask) +{ + if (event_mask & LDC_EVENT_RESET) + lp->cfg.event(lp->event_arg, LDC_EVENT_RESET); + if (event_mask & LDC_EVENT_UP) + lp->cfg.event(lp->event_arg, LDC_EVENT_UP); + if (event_mask & LDC_EVENT_DATA_READY) + lp->cfg.event(lp->event_arg, LDC_EVENT_DATA_READY); +} + +static irqreturn_t ldc_rx(int irq, void *dev_id) +{ + struct ldc_channel *lp = dev_id; + unsigned long orig_state, hv_err, flags; + unsigned int event_mask; + + spin_lock_irqsave(&lp->lock, flags); + + orig_state = lp->chan_state; + hv_err = sun4v_ldc_rx_get_state(lp->id, + &lp->rx_head, + &lp->rx_tail, + &lp->chan_state); + + ldcdbg(RX, "RX state[0x%02lx:0x%02lx] head[0x%04lx] tail[0x%04lx]\n", + orig_state, lp->chan_state, lp->rx_head, lp->rx_tail); + + event_mask = 0; + + if (lp->cfg.mode == LDC_MODE_RAW && + lp->chan_state == LDC_CHANNEL_UP) { + lp->hs_state = LDC_HS_COMPLETE; + ldc_set_state(lp, LDC_STATE_CONNECTED); + + event_mask |= LDC_EVENT_UP; + + orig_state = lp->chan_state; + } + + /* If we are in reset state, flush the RX queue and ignore + * everything. + */ + if (lp->flags & LDC_FLAG_RESET) { + (void) __set_rx_head(lp, lp->rx_tail); + goto out; + } + + /* Once we finish the handshake, we let the ldc_read() + * paths do all of the control frame and state management. + * Just trigger the callback. + */ + if (lp->hs_state == LDC_HS_COMPLETE) { +handshake_complete: + if (lp->chan_state != orig_state) { + unsigned int event = LDC_EVENT_RESET; + + if (lp->chan_state == LDC_CHANNEL_UP) + event = LDC_EVENT_UP; + + event_mask |= event; + } + if (lp->rx_head != lp->rx_tail) + event_mask |= LDC_EVENT_DATA_READY; + + goto out; + } + + if (lp->chan_state != orig_state) + goto out; + + while (lp->rx_head != lp->rx_tail) { + struct ldc_packet *p; + unsigned long new; + int err; + + p = lp->rx_base + (lp->rx_head / LDC_PACKET_SIZE); + + switch (p->type) { + case LDC_CTRL: + err = process_control_frame(lp, p); + if (err > 0) + event_mask |= err; + break; + + case LDC_DATA: + event_mask |= LDC_EVENT_DATA_READY; + err = 0; + break; + + case LDC_ERR: + err = process_error_frame(lp, p); + break; + + default: + err = ldc_abort(lp); + break; + } + + if (err < 0) + break; + + new = lp->rx_head; + new += LDC_PACKET_SIZE; + if (new == (lp->rx_num_entries * LDC_PACKET_SIZE)) + new = 0; + lp->rx_head = new; + + err = __set_rx_head(lp, new); + if (err < 0) { + (void) ldc_abort(lp); + break; + } + if (lp->hs_state == LDC_HS_COMPLETE) + goto handshake_complete; + } + +out: + spin_unlock_irqrestore(&lp->lock, flags); + + send_events(lp, event_mask); + + return IRQ_HANDLED; +} + +static irqreturn_t ldc_tx(int irq, void *dev_id) +{ + struct ldc_channel *lp = dev_id; + unsigned long flags, hv_err, orig_state; + unsigned int event_mask = 0; + + spin_lock_irqsave(&lp->lock, flags); + + orig_state = lp->chan_state; + hv_err = sun4v_ldc_tx_get_state(lp->id, + &lp->tx_head, + &lp->tx_tail, + &lp->chan_state); + + ldcdbg(TX, " TX state[0x%02lx:0x%02lx] head[0x%04lx] tail[0x%04lx]\n", + orig_state, lp->chan_state, lp->tx_head, lp->tx_tail); + + if (lp->cfg.mode == LDC_MODE_RAW && + lp->chan_state == LDC_CHANNEL_UP) { + lp->hs_state = LDC_HS_COMPLETE; + ldc_set_state(lp, LDC_STATE_CONNECTED); + + event_mask |= LDC_EVENT_UP; + } + + spin_unlock_irqrestore(&lp->lock, flags); + + send_events(lp, event_mask); + + return IRQ_HANDLED; +} + +/* XXX ldc_alloc() and ldc_free() needs to run under a mutex so + * XXX that addition and removal from the ldc_channel_list has + * XXX atomicity, otherwise the __ldc_channel_exists() check is + * XXX totally pointless as another thread can slip into ldc_alloc() + * XXX and add a channel with the same ID. There also needs to be + * XXX a spinlock for ldc_channel_list. + */ +static HLIST_HEAD(ldc_channel_list); + +static int __ldc_channel_exists(unsigned long id) +{ + struct ldc_channel *lp; + struct hlist_node *n; + + hlist_for_each_entry(lp, n, &ldc_channel_list, list) { + if (lp->id == id) + return 1; + } + return 0; +} + +static int alloc_queue(const char *name, unsigned long num_entries, + struct ldc_packet **base, unsigned long *ra) +{ + unsigned long size, order; + void *q; + + size = num_entries * LDC_PACKET_SIZE; + order = get_order(size); + + q = (void *) __get_free_pages(GFP_KERNEL, order); + if (!q) { + printk(KERN_ERR PFX "Alloc of %s queue failed with " + "size=%lu order=%lu\n", name, size, order); + return -ENOMEM; + } + + memset(q, 0, PAGE_SIZE << order); + + *base = q; + *ra = __pa(q); + + return 0; +} + +static void free_queue(unsigned long num_entries, struct ldc_packet *q) +{ + unsigned long size, order; + + if (!q) + return; + + size = num_entries * LDC_PACKET_SIZE; + order = get_order(size); + + free_pages((unsigned long)q, order); +} + +/* XXX Make this configurable... XXX */ +#define LDC_IOTABLE_SIZE (8 * 1024) + +static int ldc_iommu_init(struct ldc_channel *lp) +{ + unsigned long sz, num_tsb_entries, tsbsize, order; + struct ldc_iommu *iommu = &lp->iommu; + struct ldc_mtable_entry *table; + unsigned long hv_err; + int err; + + num_tsb_entries = LDC_IOTABLE_SIZE; + tsbsize = num_tsb_entries * sizeof(struct ldc_mtable_entry); + + spin_lock_init(&iommu->lock); + + sz = num_tsb_entries / 8; + sz = (sz + 7UL) & ~7UL; + iommu->arena.map = kzalloc(sz, GFP_KERNEL); + if (!iommu->arena.map) { + printk(KERN_ERR PFX "Alloc of arena map failed, sz=%lu\n", sz); + return -ENOMEM; + } + + iommu->arena.limit = num_tsb_entries; + + order = get_order(tsbsize); + + table = (struct ldc_mtable_entry *) + __get_free_pages(GFP_KERNEL, order); + err = -ENOMEM; + if (!table) { + printk(KERN_ERR PFX "Alloc of MTE table failed, " + "size=%lu order=%lu\n", tsbsize, order); + goto out_free_map; + } + + memset(table, 0, PAGE_SIZE << order); + + iommu->page_table = table; + + hv_err = sun4v_ldc_set_map_table(lp->id, __pa(table), + num_tsb_entries); + err = -EINVAL; + if (hv_err) + goto out_free_table; + + return 0; + +out_free_table: + free_pages((unsigned long) table, order); + iommu->page_table = NULL; + +out_free_map: + kfree(iommu->arena.map); + iommu->arena.map = NULL; + + return err; +} + +static void ldc_iommu_release(struct ldc_channel *lp) +{ + struct ldc_iommu *iommu = &lp->iommu; + unsigned long num_tsb_entries, tsbsize, order; + + (void) sun4v_ldc_set_map_table(lp->id, 0, 0); + + num_tsb_entries = iommu->arena.limit; + tsbsize = num_tsb_entries * sizeof(struct ldc_mtable_entry); + order = get_order(tsbsize); + + free_pages((unsigned long) iommu->page_table, order); + iommu->page_table = NULL; + + kfree(iommu->arena.map); + iommu->arena.map = NULL; +} + +struct ldc_channel *ldc_alloc(unsigned long id, + const struct ldc_channel_config *cfgp, + void *event_arg) +{ + struct ldc_channel *lp; + const struct ldc_mode_ops *mops; + unsigned long dummy1, dummy2, hv_err; + u8 mss, *mssbuf; + int err; + + err = -ENODEV; + if (!ldom_domaining_enabled) + goto out_err; + + err = -EINVAL; + if (!cfgp) + goto out_err; + + switch (cfgp->mode) { + case LDC_MODE_RAW: + mops = &raw_ops; + mss = LDC_PACKET_SIZE; + break; + + case LDC_MODE_UNRELIABLE: + mops = &nonraw_ops; + mss = LDC_PACKET_SIZE - 8; + break; + + case LDC_MODE_STREAM: + mops = &stream_ops; + mss = LDC_PACKET_SIZE - 8 - 8; + break; + + default: + goto out_err; + } + + if (!cfgp->event || !event_arg || !cfgp->rx_irq || !cfgp->tx_irq) + goto out_err; + + hv_err = sun4v_ldc_tx_qinfo(id, &dummy1, &dummy2); + err = -ENODEV; + if (hv_err == HV_ECHANNEL) + goto out_err; + + err = -EEXIST; + if (__ldc_channel_exists(id)) + goto out_err; + + mssbuf = NULL; + + lp = kzalloc(sizeof(*lp), GFP_KERNEL); + err = -ENOMEM; + if (!lp) + goto out_err; + + spin_lock_init(&lp->lock); + + lp->id = id; + + err = ldc_iommu_init(lp); + if (err) + goto out_free_ldc; + + lp->mops = mops; + lp->mss = mss; + + lp->cfg = *cfgp; + if (!lp->cfg.mtu) + lp->cfg.mtu = LDC_DEFAULT_MTU; + + if (lp->cfg.mode == LDC_MODE_STREAM) { + mssbuf = kzalloc(lp->cfg.mtu, GFP_KERNEL); + if (!mssbuf) { + err = -ENOMEM; + goto out_free_iommu; + } + lp->mssbuf = mssbuf; + } + + lp->event_arg = event_arg; + + /* XXX allow setting via ldc_channel_config to override defaults + * XXX or use some formula based upon mtu + */ + lp->tx_num_entries = LDC_DEFAULT_NUM_ENTRIES; + lp->rx_num_entries = LDC_DEFAULT_NUM_ENTRIES; + + err = alloc_queue("TX", lp->tx_num_entries, + &lp->tx_base, &lp->tx_ra); + if (err) + goto out_free_mssbuf; + + err = alloc_queue("RX", lp->rx_num_entries, + &lp->rx_base, &lp->rx_ra); + if (err) + goto out_free_txq; + + lp->flags |= LDC_FLAG_ALLOCED_QUEUES; + + lp->hs_state = LDC_HS_CLOSED; + ldc_set_state(lp, LDC_STATE_INIT); + + INIT_HLIST_NODE(&lp->list); + hlist_add_head(&lp->list, &ldc_channel_list); + + INIT_HLIST_HEAD(&lp->mh_list); + + return lp; + +out_free_txq: + free_queue(lp->tx_num_entries, lp->tx_base); + +out_free_mssbuf: + if (mssbuf) + kfree(mssbuf); + +out_free_iommu: + ldc_iommu_release(lp); + +out_free_ldc: + kfree(lp); + +out_err: + return ERR_PTR(err); +} +EXPORT_SYMBOL(ldc_alloc); + +void ldc_free(struct ldc_channel *lp) +{ + if (lp->flags & LDC_FLAG_REGISTERED_IRQS) { + free_irq(lp->cfg.rx_irq, lp); + free_irq(lp->cfg.tx_irq, lp); + } + + if (lp->flags & LDC_FLAG_REGISTERED_QUEUES) { + sun4v_ldc_tx_qconf(lp->id, 0, 0); + sun4v_ldc_rx_qconf(lp->id, 0, 0); + lp->flags &= ~LDC_FLAG_REGISTERED_QUEUES; + } + if (lp->flags & LDC_FLAG_ALLOCED_QUEUES) { + free_queue(lp->tx_num_entries, lp->tx_base); + free_queue(lp->rx_num_entries, lp->rx_base); + lp->flags &= ~LDC_FLAG_ALLOCED_QUEUES; + } + + hlist_del(&lp->list); + + if (lp->mssbuf) + kfree(lp->mssbuf); + + ldc_iommu_release(lp); + + kfree(lp); +} +EXPORT_SYMBOL(ldc_free); + +/* Bind the channel. This registers the LDC queues with + * the hypervisor and puts the channel into a pseudo-listening + * state. This does not initiate a handshake, ldc_connect() does + * that. + */ +int ldc_bind(struct ldc_channel *lp, const char *name) +{ + unsigned long hv_err, flags; + int err = -EINVAL; + + spin_lock_irqsave(&lp->lock, flags); + + if (!name) + goto out_err; + + if (lp->state != LDC_STATE_INIT) + goto out_err; + + snprintf(lp->rx_irq_name, LDC_IRQ_NAME_MAX, "%s RX", name); + snprintf(lp->tx_irq_name, LDC_IRQ_NAME_MAX, "%s TX", name); + + err = request_irq(lp->cfg.rx_irq, ldc_rx, + IRQF_SAMPLE_RANDOM | IRQF_SHARED, + lp->rx_irq_name, lp); + if (err) + goto out_err; + + err = request_irq(lp->cfg.tx_irq, ldc_tx, + IRQF_SAMPLE_RANDOM | IRQF_SHARED, + lp->tx_irq_name, lp); + if (err) + goto out_free_rx_irq; + + + lp->flags |= LDC_FLAG_REGISTERED_IRQS; + + err = -ENODEV; + hv_err = sun4v_ldc_tx_qconf(lp->id, 0, 0); + if (hv_err) + goto out_free_tx_irq; + + hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries); + if (hv_err) + goto out_free_tx_irq; + + hv_err = sun4v_ldc_rx_qconf(lp->id, 0, 0); + if (hv_err) + goto out_unmap_tx; + + hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries); + if (hv_err) + goto out_unmap_tx; + + lp->flags |= LDC_FLAG_REGISTERED_QUEUES; + + hv_err = sun4v_ldc_tx_get_state(lp->id, + &lp->tx_head, + &lp->tx_tail, + &lp->chan_state); + err = -EBUSY; + if (hv_err) + goto out_unmap_rx; + + lp->tx_acked = lp->tx_head; + + lp->hs_state = LDC_HS_OPEN; + ldc_set_state(lp, LDC_STATE_BOUND); + + spin_unlock_irqrestore(&lp->lock, flags); + + return 0; + +out_unmap_rx: + lp->flags &= ~LDC_FLAG_REGISTERED_QUEUES; + sun4v_ldc_rx_qconf(lp->id, 0, 0); + +out_unmap_tx: + sun4v_ldc_tx_qconf(lp->id, 0, 0); + +out_free_tx_irq: + lp->flags &= ~LDC_FLAG_REGISTERED_IRQS; + free_irq(lp->cfg.tx_irq, lp); + +out_free_rx_irq: + free_irq(lp->cfg.rx_irq, lp); + +out_err: + spin_unlock_irqrestore(&lp->lock, flags); + + return err; +} +EXPORT_SYMBOL(ldc_bind); + +int ldc_connect(struct ldc_channel *lp) +{ + unsigned long flags; + int err; + + if (lp->cfg.mode == LDC_MODE_RAW) + return -EINVAL; + + spin_lock_irqsave(&lp->lock, flags); + + if (!(lp->flags & LDC_FLAG_ALLOCED_QUEUES) || + !(lp->flags & LDC_FLAG_REGISTERED_QUEUES) || + lp->hs_state != LDC_HS_OPEN) + err = -EINVAL; + else + err = start_handshake(lp); + + spin_unlock_irqrestore(&lp->lock, flags); + + return err; +} +EXPORT_SYMBOL(ldc_connect); + +int ldc_disconnect(struct ldc_channel *lp) +{ + unsigned long hv_err, flags; + int err; + + if (lp->cfg.mode == LDC_MODE_RAW) + return -EINVAL; + + if (!(lp->flags & LDC_FLAG_ALLOCED_QUEUES) || + !(lp->flags & LDC_FLAG_REGISTERED_QUEUES)) + return -EINVAL; + + spin_lock_irqsave(&lp->lock, flags); + + err = -ENODEV; + hv_err = sun4v_ldc_tx_qconf(lp->id, 0, 0); + if (hv_err) + goto out_err; + + hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries); + if (hv_err) + goto out_err; + + hv_err = sun4v_ldc_rx_qconf(lp->id, 0, 0); + if (hv_err) + goto out_err; + + hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries); + if (hv_err) + goto out_err; + + ldc_set_state(lp, LDC_STATE_BOUND); + lp->hs_state = LDC_HS_OPEN; + lp->flags |= LDC_FLAG_RESET; + + spin_unlock_irqrestore(&lp->lock, flags); + + return 0; + +out_err: + sun4v_ldc_tx_qconf(lp->id, 0, 0); + sun4v_ldc_rx_qconf(lp->id, 0, 0); + free_irq(lp->cfg.tx_irq, lp); + free_irq(lp->cfg.rx_irq, lp); + lp->flags &= ~(LDC_FLAG_REGISTERED_IRQS | + LDC_FLAG_REGISTERED_QUEUES); + ldc_set_state(lp, LDC_STATE_INIT); + + spin_unlock_irqrestore(&lp->lock, flags); + + return err; +} +EXPORT_SYMBOL(ldc_disconnect); + +int ldc_state(struct ldc_channel *lp) +{ + return lp->state; +} +EXPORT_SYMBOL(ldc_state); + +static int write_raw(struct ldc_channel *lp, const void *buf, unsigned int size) +{ + struct ldc_packet *p; + unsigned long new_tail; + int err; + + if (size > LDC_PACKET_SIZE) + return -EMSGSIZE; + + p = data_get_tx_packet(lp, &new_tail); + if (!p) + return -EAGAIN; + + memcpy(p, buf, size); + + err = send_tx_packet(lp, p, new_tail); + if (!err) + err = size; + + return err; +} + +static int read_raw(struct ldc_channel *lp, void *buf, unsigned int size) +{ + struct ldc_packet *p; + unsigned long hv_err, new; + int err; + + if (size < LDC_PACKET_SIZE) + return -EINVAL; + + hv_err = sun4v_ldc_rx_get_state(lp->id, + &lp->rx_head, + &lp->rx_tail, + &lp->chan_state); + if (hv_err) + return ldc_abort(lp); + + if (lp->chan_state == LDC_CHANNEL_DOWN || + lp->chan_state == LDC_CHANNEL_RESETTING) + return -ECONNRESET; + + if (lp->rx_head == lp->rx_tail) + return 0; + + p = lp->rx_base + (lp->rx_head / LDC_PACKET_SIZE); + memcpy(buf, p, LDC_PACKET_SIZE); + + new = rx_advance(lp, lp->rx_head); + lp->rx_head = new; + + err = __set_rx_head(lp, new); + if (err < 0) + err = -ECONNRESET; + else + err = LDC_PACKET_SIZE; + + return err; +} + +static const struct ldc_mode_ops raw_ops = { + .write = write_raw, + .read = read_raw, +}; + +static int write_nonraw(struct ldc_channel *lp, const void *buf, + unsigned int size) +{ + unsigned long hv_err, tail; + unsigned int copied; + u32 seq; + int err; + + hv_err = sun4v_ldc_tx_get_state(lp->id, &lp->tx_head, &lp->tx_tail, + &lp->chan_state); + if (unlikely(hv_err)) + return -EBUSY; + + if (unlikely(lp->chan_state != LDC_CHANNEL_UP)) + return ldc_abort(lp); + + if (!tx_has_space_for(lp, size)) + return -EAGAIN; + + seq = lp->snd_nxt; + copied = 0; + tail = lp->tx_tail; + while (copied < size) { + struct ldc_packet *p = lp->tx_base + (tail / LDC_PACKET_SIZE); + u8 *data = ((lp->cfg.mode == LDC_MODE_UNRELIABLE) ? + p->u.u_data : + p->u.r.r_data); + int data_len; + + p->type = LDC_DATA; + p->stype = LDC_INFO; + p->ctrl = 0; + + data_len = size - copied; + if (data_len > lp->mss) + data_len = lp->mss; + + BUG_ON(data_len > LDC_LEN); + + p->env = (data_len | + (copied == 0 ? LDC_START : 0) | + (data_len == size - copied ? LDC_STOP : 0)); + + p->seqid = ++seq; + + ldcdbg(DATA, "SENT DATA [%02x:%02x:%02x:%02x:%08x]\n", + p->type, + p->stype, + p->ctrl, + p->env, + p->seqid); + + memcpy(data, buf, data_len); + buf += data_len; + copied += data_len; + + tail = tx_advance(lp, tail); + } + + err = set_tx_tail(lp, tail); + if (!err) { + lp->snd_nxt = seq; + err = size; + } + + return err; +} + +static int rx_bad_seq(struct ldc_channel *lp, struct ldc_packet *p, + struct ldc_packet *first_frag) +{ + int err; + + if (first_frag) + lp->rcv_nxt = first_frag->seqid - 1; + + err = send_data_nack(lp, p); + if (err) + return err; + + err = __set_rx_head(lp, lp->rx_tail); + if (err < 0) + return ldc_abort(lp); + + return 0; +} + +static int data_ack_nack(struct ldc_channel *lp, struct ldc_packet *p) +{ + if (p->stype & LDC_ACK) { + int err = process_data_ack(lp, p); + if (err) + return err; + } + if (p->stype & LDC_NACK) + return ldc_abort(lp); + + return 0; +} + +static int rx_data_wait(struct ldc_channel *lp, unsigned long cur_head) +{ + unsigned long dummy; + int limit = 1000; + + ldcdbg(DATA, "DATA WAIT cur_head[%lx] rx_head[%lx] rx_tail[%lx]\n", + cur_head, lp->rx_head, lp->rx_tail); + while (limit-- > 0) { + unsigned long hv_err; + + hv_err = sun4v_ldc_rx_get_state(lp->id, + &dummy, + &lp->rx_tail, + &lp->chan_state); + if (hv_err) + return ldc_abort(lp); + + if (lp->chan_state == LDC_CHANNEL_DOWN || + lp->chan_state == LDC_CHANNEL_RESETTING) + return -ECONNRESET; + + if (cur_head != lp->rx_tail) { + ldcdbg(DATA, "DATA WAIT DONE " + "head[%lx] tail[%lx] chan_state[%lx]\n", + dummy, lp->rx_tail, lp->chan_state); + return 0; + } + + udelay(1); + } + return -EAGAIN; +} + +static int rx_set_head(struct ldc_channel *lp, unsigned long head) +{ + int err = __set_rx_head(lp, head); + + if (err < 0) + return ldc_abort(lp); + + lp->rx_head = head; + return 0; +} + +static void send_data_ack(struct ldc_channel *lp) +{ + unsigned long new_tail; + struct ldc_packet *p; + + p = data_get_tx_packet(lp, &new_tail); + if (likely(p)) { + int err; + + memset(p, 0, sizeof(*p)); + p->type = LDC_DATA; + p->stype = LDC_ACK; + p->ctrl = 0; + p->seqid = lp->snd_nxt + 1; + p->u.r.ackid = lp->rcv_nxt; + + err = send_tx_packet(lp, p, new_tail); + if (!err) + lp->snd_nxt++; + } +} + +static int read_nonraw(struct ldc_channel *lp, void *buf, unsigned int size) +{ + struct ldc_packet *first_frag; + unsigned long hv_err, new; + int err, copied; + + hv_err = sun4v_ldc_rx_get_state(lp->id, + &lp->rx_head, + &lp->rx_tail, + &lp->chan_state); + if (hv_err) + return ldc_abort(lp); + + if (lp->chan_state == LDC_CHANNEL_DOWN || + lp->chan_state == LDC_CHANNEL_RESETTING) + return -ECONNRESET; + + if (lp->rx_head == lp->rx_tail) + return 0; + + first_frag = NULL; + copied = err = 0; + new = lp->rx_head; + while (1) { + struct ldc_packet *p; + int pkt_len; + + BUG_ON(new == lp->rx_tail); + p = lp->rx_base + (new / LDC_PACKET_SIZE); + + ldcdbg(RX, "RX read pkt[%02x:%02x:%02x:%02x:%08x:%08x] " + "rcv_nxt[%08x]\n", + p->type, + p->stype, + p->ctrl, + p->env, + p->seqid, + p->u.r.ackid, + lp->rcv_nxt); + + if (unlikely(!rx_seq_ok(lp, p->seqid))) { + err = rx_bad_seq(lp, p, first_frag); + copied = 0; + break; + } + + if (p->type & LDC_CTRL) { + err = process_control_frame(lp, p); + if (err < 0) + break; + err = 0; + } + + lp->rcv_nxt = p->seqid; + + if (!(p->type & LDC_DATA)) { + new = rx_advance(lp, new); + goto no_data; + } + if (p->stype & (LDC_ACK | LDC_NACK)) { + err = data_ack_nack(lp, p); + if (err) + break; + } + if (!(p->stype & LDC_INFO)) { + new = rx_advance(lp, new); + err = rx_set_head(lp, new); + if (err) + break; + goto no_data; + } + + pkt_len = p->env & LDC_LEN; + + /* Every initial packet starts with the START bit set. + * + * Singleton packets will have both START+STOP set. + * + * Fragments will have START set in the first frame, STOP + * set in the last frame, and neither bit set in middle + * frames of the packet. + * + * Therefore if we are at the beginning of a packet and + * we don't see START, or we are in the middle of a fragmented + * packet and do see START, we are unsynchronized and should + * flush the RX queue. + */ + if ((first_frag == NULL && !(p->env & LDC_START)) || + (first_frag != NULL && (p->env & LDC_START))) { + if (!first_frag) + new = rx_advance(lp, new); + + err = rx_set_head(lp, new); + if (err) + break; + + if (!first_frag) + goto no_data; + } + if (!first_frag) + first_frag = p; + + if (pkt_len > size - copied) { + /* User didn't give us a big enough buffer, + * what to do? This is a pretty serious error. + * + * Since we haven't updated the RX ring head to + * consume any of the packets, signal the error + * to the user and just leave the RX ring alone. + * + * This seems the best behavior because this allows + * a user of the LDC layer to start with a small + * RX buffer for ldc_read() calls and use -EMSGSIZE + * as a cue to enlarge it's read buffer. + */ + err = -EMSGSIZE; + break; + } + + /* Ok, we are gonna eat this one. */ + new = rx_advance(lp, new); + + memcpy(buf, + (lp->cfg.mode == LDC_MODE_UNRELIABLE ? + p->u.u_data : p->u.r.r_data), pkt_len); + buf += pkt_len; + copied += pkt_len; + + if (p->env & LDC_STOP) + break; + +no_data: + if (new == lp->rx_tail) { + err = rx_data_wait(lp, new); + if (err) + break; + } + } + + if (!err) + err = rx_set_head(lp, new); + + if (err && first_frag) + lp->rcv_nxt = first_frag->seqid - 1; + + if (!err) { + err = copied; + if (err > 0 && lp->cfg.mode != LDC_MODE_UNRELIABLE) + send_data_ack(lp); + } + + return err; +} + +static const struct ldc_mode_ops nonraw_ops = { + .write = write_nonraw, + .read = read_nonraw, +}; + +static int write_stream(struct ldc_channel *lp, const void *buf, + unsigned int size) +{ + if (size > lp->cfg.mtu) + size = lp->cfg.mtu; + return write_nonraw(lp, buf, size); +} + +static int read_stream(struct ldc_channel *lp, void *buf, unsigned int size) +{ + if (!lp->mssbuf_len) { + int err = read_nonraw(lp, lp->mssbuf, lp->cfg.mtu); + if (err < 0) + return err; + + lp->mssbuf_len = err; + lp->mssbuf_off = 0; + } + + if (size > lp->mssbuf_len) + size = lp->mssbuf_len; + memcpy(buf, lp->mssbuf + lp->mssbuf_off, size); + + lp->mssbuf_off += size; + lp->mssbuf_len -= size; + + return size; +} + +static const struct ldc_mode_ops stream_ops = { + .write = write_stream, + .read = read_stream, +}; + +int ldc_write(struct ldc_channel *lp, const void *buf, unsigned int size) +{ + unsigned long flags; + int err; + + if (!buf) + return -EINVAL; + + if (!size) + return 0; + + spin_lock_irqsave(&lp->lock, flags); + + if (lp->hs_state != LDC_HS_COMPLETE) + err = -ENOTCONN; + else + err = lp->mops->write(lp, buf, size); + + spin_unlock_irqrestore(&lp->lock, flags); + + return err; +} +EXPORT_SYMBOL(ldc_write); + +int ldc_read(struct ldc_channel *lp, void *buf, unsigned int size) +{ + unsigned long flags; + int err; + + if (!buf) + return -EINVAL; + + if (!size) + return 0; + + spin_lock_irqsave(&lp->lock, flags); + + if (lp->hs_state != LDC_HS_COMPLETE) + err = -ENOTCONN; + else + err = lp->mops->read(lp, buf, size); + + spin_unlock_irqrestore(&lp->lock, flags); + + return err; +} +EXPORT_SYMBOL(ldc_read); + +static long arena_alloc(struct ldc_iommu *iommu, unsigned long npages) +{ + struct iommu_arena *arena = &iommu->arena; + unsigned long n, i, start, end, limit; + int pass; + + limit = arena->limit; + start = arena->hint; + pass = 0; + +again: + n = find_next_zero_bit(arena->map, limit, start); + end = n + npages; + if (unlikely(end >= limit)) { + if (likely(pass < 1)) { + limit = start; + start = 0; + pass++; + goto again; + } else { + /* Scanned the whole thing, give up. */ + return -1; + } + } + + for (i = n; i < end; i++) { + if (test_bit(i, arena->map)) { + start = i + 1; + goto again; + } + } + + for (i = n; i < end; i++) + __set_bit(i, arena->map); + + arena->hint = end; + + return n; +} + +#define COOKIE_PGSZ_CODE 0xf000000000000000ULL +#define COOKIE_PGSZ_CODE_SHIFT 60ULL + +static u64 pagesize_code(void) +{ + switch (PAGE_SIZE) { + default: + case (8ULL * 1024ULL): + return 0; + case (64ULL * 1024ULL): + return 1; + case (512ULL * 1024ULL): + return 2; + case (4ULL * 1024ULL * 1024ULL): + return 3; + case (32ULL * 1024ULL * 1024ULL): + return 4; + case (256ULL * 1024ULL * 1024ULL): + return 5; + } +} + +static u64 make_cookie(u64 index, u64 pgsz_code, u64 page_offset) +{ + return ((pgsz_code << COOKIE_PGSZ_CODE_SHIFT) | + (index << PAGE_SHIFT) | + page_offset); +} + +static u64 cookie_to_index(u64 cookie, unsigned long *shift) +{ + u64 szcode = cookie >> COOKIE_PGSZ_CODE_SHIFT; + + cookie &= ~COOKIE_PGSZ_CODE; + + *shift = szcode * 3; + + return (cookie >> (13ULL + (szcode * 3ULL))); +} + +static struct ldc_mtable_entry *alloc_npages(struct ldc_iommu *iommu, + unsigned long npages) +{ + long entry; + + entry = arena_alloc(iommu, npages); + if (unlikely(entry < 0)) + return NULL; + + return iommu->page_table + entry; +} + +static u64 perm_to_mte(unsigned int map_perm) +{ + u64 mte_base; + + mte_base = pagesize_code(); + + if (map_perm & LDC_MAP_SHADOW) { + if (map_perm & LDC_MAP_R) + mte_base |= LDC_MTE_COPY_R; + if (map_perm & LDC_MAP_W) + mte_base |= LDC_MTE_COPY_W; + } + if (map_perm & LDC_MAP_DIRECT) { + if (map_perm & LDC_MAP_R) + mte_base |= LDC_MTE_READ; + if (map_perm & LDC_MAP_W) + mte_base |= LDC_MTE_WRITE; + if (map_perm & LDC_MAP_X) + mte_base |= LDC_MTE_EXEC; + } + if (map_perm & LDC_MAP_IO) { + if (map_perm & LDC_MAP_R) + mte_base |= LDC_MTE_IOMMU_R; + if (map_perm & LDC_MAP_W) + mte_base |= LDC_MTE_IOMMU_W; + } + + return mte_base; +} + +static int pages_in_region(unsigned long base, long len) +{ + int count = 0; + + do { + unsigned long new = (base + PAGE_SIZE) & PAGE_MASK; + + len -= (new - base); + base = new; + count++; + } while (len > 0); + + return count; +} + +struct cookie_state { + struct ldc_mtable_entry *page_table; + struct ldc_trans_cookie *cookies; + u64 mte_base; + u64 prev_cookie; + u32 pte_idx; + u32 nc; +}; + +static void fill_cookies(struct cookie_state *sp, unsigned long pa, + unsigned long off, unsigned long len) +{ + do { + unsigned long tlen, new = pa + PAGE_SIZE; + u64 this_cookie; + + sp->page_table[sp->pte_idx].mte = sp->mte_base | pa; + + tlen = PAGE_SIZE; + if (off) + tlen = PAGE_SIZE - off; + if (tlen > len) + tlen = len; + + this_cookie = make_cookie(sp->pte_idx, + pagesize_code(), off); + + off = 0; + + if (this_cookie == sp->prev_cookie) { + sp->cookies[sp->nc - 1].cookie_size += tlen; + } else { + sp->cookies[sp->nc].cookie_addr = this_cookie; + sp->cookies[sp->nc].cookie_size = tlen; + sp->nc++; + } + sp->prev_cookie = this_cookie + tlen; + + sp->pte_idx++; + + len -= tlen; + pa = new; + } while (len > 0); +} + +static int sg_count_one(struct scatterlist *sg) +{ + unsigned long base = page_to_pfn(sg->page) << PAGE_SHIFT; + long len = sg->length; + + if ((sg->offset | len) & (8UL - 1)) + return -EFAULT; + + return pages_in_region(base + sg->offset, len); +} + +static int sg_count_pages(struct scatterlist *sg, int num_sg) +{ + int count; + int i; + + count = 0; + for (i = 0; i < num_sg; i++) { + int err = sg_count_one(sg + i); + if (err < 0) + return err; + count += err; + } + + return count; +} + +int ldc_map_sg(struct ldc_channel *lp, + struct scatterlist *sg, int num_sg, + struct ldc_trans_cookie *cookies, int ncookies, + unsigned int map_perm) +{ + unsigned long i, npages, flags; + struct ldc_mtable_entry *base; + struct cookie_state state; + struct ldc_iommu *iommu; + int err; + + if (map_perm & ~LDC_MAP_ALL) + return -EINVAL; + + err = sg_count_pages(sg, num_sg); + if (err < 0) + return err; + + npages = err; + if (err > ncookies) + return -EMSGSIZE; + + iommu = &lp->iommu; + + spin_lock_irqsave(&iommu->lock, flags); + base = alloc_npages(iommu, npages); + spin_unlock_irqrestore(&iommu->lock, flags); + + if (!base) + return -ENOMEM; + + state.page_table = iommu->page_table; + state.cookies = cookies; + state.mte_base = perm_to_mte(map_perm); + state.prev_cookie = ~(u64)0; + state.pte_idx = (base - iommu->page_table); + state.nc = 0; + + for (i = 0; i < num_sg; i++) + fill_cookies(&state, page_to_pfn(sg[i].page) << PAGE_SHIFT, + sg[i].offset, sg[i].length); + + return state.nc; +} +EXPORT_SYMBOL(ldc_map_sg); + +int ldc_map_single(struct ldc_channel *lp, + void *buf, unsigned int len, + struct ldc_trans_cookie *cookies, int ncookies, + unsigned int map_perm) +{ + unsigned long npages, pa, flags; + struct ldc_mtable_entry *base; + struct cookie_state state; + struct ldc_iommu *iommu; + + if ((map_perm & ~LDC_MAP_ALL) || (ncookies < 1)) + return -EINVAL; + + pa = __pa(buf); + if ((pa | len) & (8UL - 1)) + return -EFAULT; + + npages = pages_in_region(pa, len); + + iommu = &lp->iommu; + + spin_lock_irqsave(&iommu->lock, flags); + base = alloc_npages(iommu, npages); + spin_unlock_irqrestore(&iommu->lock, flags); + + if (!base) + return -ENOMEM; + + state.page_table = iommu->page_table; + state.cookies = cookies; + state.mte_base = perm_to_mte(map_perm); + state.prev_cookie = ~(u64)0; + state.pte_idx = (base - iommu->page_table); + state.nc = 0; + fill_cookies(&state, (pa & PAGE_MASK), (pa & ~PAGE_MASK), len); + BUG_ON(state.nc != 1); + + return state.nc; +} +EXPORT_SYMBOL(ldc_map_single); + +static void free_npages(unsigned long id, struct ldc_iommu *iommu, + u64 cookie, u64 size) +{ + struct iommu_arena *arena = &iommu->arena; + unsigned long i, shift, index, npages; + struct ldc_mtable_entry *base; + + npages = PAGE_ALIGN(((cookie & ~PAGE_MASK) + size)) >> PAGE_SHIFT; + index = cookie_to_index(cookie, &shift); + base = iommu->page_table + index; + + BUG_ON(index > arena->limit || + (index + npages) > arena->limit); + + for (i = 0; i < npages; i++) { + if (base->cookie) + sun4v_ldc_revoke(id, cookie + (i << shift), + base->cookie); + base->mte = 0; + __clear_bit(index + i, arena->map); + } +} + +void ldc_unmap(struct ldc_channel *lp, struct ldc_trans_cookie *cookies, + int ncookies) +{ + struct ldc_iommu *iommu = &lp->iommu; + unsigned long flags; + int i; + + spin_lock_irqsave(&iommu->lock, flags); + for (i = 0; i < ncookies; i++) { + u64 addr = cookies[i].cookie_addr; + u64 size = cookies[i].cookie_size; + + free_npages(lp->id, iommu, addr, size); + } + spin_unlock_irqrestore(&iommu->lock, flags); +} +EXPORT_SYMBOL(ldc_unmap); + +int ldc_copy(struct ldc_channel *lp, int copy_dir, + void *buf, unsigned int len, unsigned long offset, + struct ldc_trans_cookie *cookies, int ncookies) +{ + unsigned int orig_len; + unsigned long ra; + int i; + + if (copy_dir != LDC_COPY_IN && copy_dir != LDC_COPY_OUT) { + printk(KERN_ERR PFX "ldc_copy: ID[%lu] Bad copy_dir[%d]\n", + lp->id, copy_dir); + return -EINVAL; + } + + ra = __pa(buf); + if ((ra | len | offset) & (8UL - 1)) { + printk(KERN_ERR PFX "ldc_copy: ID[%lu] Unaligned buffer " + "ra[%lx] len[%x] offset[%lx]\n", + lp->id, ra, len, offset); + return -EFAULT; + } + + if (lp->hs_state != LDC_HS_COMPLETE || + (lp->flags & LDC_FLAG_RESET)) { + printk(KERN_ERR PFX "ldc_copy: ID[%lu] Link down hs_state[%x] " + "flags[%x]\n", lp->id, lp->hs_state, lp->flags); + return -ECONNRESET; + } + + orig_len = len; + for (i = 0; i < ncookies; i++) { + unsigned long cookie_raddr = cookies[i].cookie_addr; + unsigned long this_len = cookies[i].cookie_size; + unsigned long actual_len; + + if (unlikely(offset)) { + unsigned long this_off = offset; + + if (this_off > this_len) + this_off = this_len; + + offset -= this_off; + this_len -= this_off; + if (!this_len) + continue; + cookie_raddr += this_off; + } + + if (this_len > len) + this_len = len; + + while (1) { + unsigned long hv_err; + + hv_err = sun4v_ldc_copy(lp->id, copy_dir, + cookie_raddr, ra, + this_len, &actual_len); + if (unlikely(hv_err)) { + printk(KERN_ERR PFX "ldc_copy: ID[%lu] " + "HV error %lu\n", + lp->id, hv_err); + if (lp->hs_state != LDC_HS_COMPLETE || + (lp->flags & LDC_FLAG_RESET)) + return -ECONNRESET; + else + return -EFAULT; + } + + cookie_raddr += actual_len; + ra += actual_len; + len -= actual_len; + if (actual_len == this_len) + break; + + this_len -= actual_len; + } + + if (!len) + break; + } + + /* It is caller policy what to do about short copies. + * For example, a networking driver can declare the + * packet a runt and drop it. + */ + + return orig_len - len; +} +EXPORT_SYMBOL(ldc_copy); + +void *ldc_alloc_exp_dring(struct ldc_channel *lp, unsigned int len, + struct ldc_trans_cookie *cookies, int *ncookies, + unsigned int map_perm) +{ + void *buf; + int err; + + if (len & (8UL - 1)) + return ERR_PTR(-EINVAL); + + buf = kzalloc(len, GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + err = ldc_map_single(lp, buf, len, cookies, *ncookies, map_perm); + if (err < 0) { + kfree(buf); + return ERR_PTR(err); + } + *ncookies = err; + + return buf; +} +EXPORT_SYMBOL(ldc_alloc_exp_dring); + +void ldc_free_exp_dring(struct ldc_channel *lp, void *buf, unsigned int len, + struct ldc_trans_cookie *cookies, int ncookies) +{ + ldc_unmap(lp, cookies, ncookies); + kfree(buf); +} +EXPORT_SYMBOL(ldc_free_exp_dring); + +static int __init ldc_init(void) +{ + unsigned long major, minor; + struct mdesc_handle *hp; + const u64 *v; + u64 mp; + + hp = mdesc_grab(); + if (!hp) + return -ENODEV; + + mp = mdesc_node_by_name(hp, MDESC_NODE_NULL, "platform"); + if (mp == MDESC_NODE_NULL) + return -ENODEV; + + v = mdesc_get_property(hp, mp, "domaining-enabled", NULL); + if (!v) + return -ENODEV; + + major = 1; + minor = 0; + if (sun4v_hvapi_register(HV_GRP_LDOM, major, &minor)) { + printk(KERN_INFO PFX "Could not register LDOM hvapi.\n"); + return -ENODEV; + } + + printk(KERN_INFO "%s", version); + + if (!*v) { + printk(KERN_INFO PFX "Domaining disabled.\n"); + return -ENODEV; + } + ldom_domaining_enabled = 1; + + return 0; +} + +core_initcall(ldc_init); diff --git a/arch/sparc64/kernel/mdesc.c b/arch/sparc64/kernel/mdesc.c index f0e16045fb16..62a389793949 100644 --- a/arch/sparc64/kernel/mdesc.c +++ b/arch/sparc64/kernel/mdesc.c @@ -6,6 +6,9 @@ #include <linux/types.h> #include <linux/bootmem.h> #include <linux/log2.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/mm.h> #include <asm/hypervisor.h> #include <asm/mdesc.h> @@ -29,7 +32,7 @@ struct mdesc_hdr { u32 node_sz; /* node block size */ u32 name_sz; /* name block size */ u32 data_sz; /* data block size */ -}; +} __attribute__((aligned(16))); struct mdesc_elem { u8 tag; @@ -53,306 +56,402 @@ struct mdesc_elem { } d; }; -static struct mdesc_hdr *main_mdesc; -static struct mdesc_node *allnodes; - -static struct mdesc_node *allnodes_tail; -static unsigned int unique_id; +struct mdesc_mem_ops { + struct mdesc_handle *(*alloc)(unsigned int mdesc_size); + void (*free)(struct mdesc_handle *handle); +}; -static struct mdesc_node **mdesc_hash; -static unsigned int mdesc_hash_size; +struct mdesc_handle { + struct list_head list; + struct mdesc_mem_ops *mops; + void *self_base; + atomic_t refcnt; + unsigned int handle_size; + struct mdesc_hdr mdesc; +}; -static inline unsigned int node_hashfn(u64 node) +static void mdesc_handle_init(struct mdesc_handle *hp, + unsigned int handle_size, + void *base) { - return ((unsigned int) (node ^ (node >> 8) ^ (node >> 16))) - & (mdesc_hash_size - 1); + BUG_ON(((unsigned long)&hp->mdesc) & (16UL - 1)); + + memset(hp, 0, handle_size); + INIT_LIST_HEAD(&hp->list); + hp->self_base = base; + atomic_set(&hp->refcnt, 1); + hp->handle_size = handle_size; } -static inline void hash_node(struct mdesc_node *mp) +static struct mdesc_handle *mdesc_bootmem_alloc(unsigned int mdesc_size) { - struct mdesc_node **head = &mdesc_hash[node_hashfn(mp->node)]; + struct mdesc_handle *hp; + unsigned int handle_size, alloc_size; - mp->hash_next = *head; - *head = mp; + handle_size = (sizeof(struct mdesc_handle) - + sizeof(struct mdesc_hdr) + + mdesc_size); + alloc_size = PAGE_ALIGN(handle_size); - if (allnodes_tail) { - allnodes_tail->allnodes_next = mp; - allnodes_tail = mp; - } else { - allnodes = allnodes_tail = mp; - } + hp = __alloc_bootmem(alloc_size, PAGE_SIZE, 0UL); + if (hp) + mdesc_handle_init(hp, handle_size, hp); + + return hp; } -static struct mdesc_node *find_node(u64 node) +static void mdesc_bootmem_free(struct mdesc_handle *hp) { - struct mdesc_node *mp = mdesc_hash[node_hashfn(node)]; + unsigned int alloc_size, handle_size = hp->handle_size; + unsigned long start, end; + + BUG_ON(atomic_read(&hp->refcnt) != 0); + BUG_ON(!list_empty(&hp->list)); - while (mp) { - if (mp->node == node) - return mp; + alloc_size = PAGE_ALIGN(handle_size); - mp = mp->hash_next; + start = (unsigned long) hp; + end = start + alloc_size; + + while (start < end) { + struct page *p; + + p = virt_to_page(start); + ClearPageReserved(p); + __free_page(p); + start += PAGE_SIZE; } - return NULL; } -struct property *md_find_property(const struct mdesc_node *mp, - const char *name, - int *lenp) +static struct mdesc_mem_ops bootmem_mdesc_memops = { + .alloc = mdesc_bootmem_alloc, + .free = mdesc_bootmem_free, +}; + +static struct mdesc_handle *mdesc_kmalloc(unsigned int mdesc_size) { - struct property *pp; + unsigned int handle_size; + void *base; - for (pp = mp->properties; pp != 0; pp = pp->next) { - if (strcasecmp(pp->name, name) == 0) { - if (lenp) - *lenp = pp->length; - break; - } + handle_size = (sizeof(struct mdesc_handle) - + sizeof(struct mdesc_hdr) + + mdesc_size); + + base = kmalloc(handle_size + 15, GFP_KERNEL); + if (base) { + struct mdesc_handle *hp; + unsigned long addr; + + addr = (unsigned long)base; + addr = (addr + 15UL) & ~15UL; + hp = (struct mdesc_handle *) addr; + + mdesc_handle_init(hp, handle_size, base); + return hp; } - return pp; + + return NULL; } -EXPORT_SYMBOL(md_find_property); -/* - * Find a property with a given name for a given node - * and return the value. - */ -const void *md_get_property(const struct mdesc_node *mp, const char *name, - int *lenp) +static void mdesc_kfree(struct mdesc_handle *hp) { - struct property *pp = md_find_property(mp, name, lenp); - return pp ? pp->value : NULL; + BUG_ON(atomic_read(&hp->refcnt) != 0); + BUG_ON(!list_empty(&hp->list)); + + kfree(hp->self_base); } -EXPORT_SYMBOL(md_get_property); -struct mdesc_node *md_find_node_by_name(struct mdesc_node *from, - const char *name) +static struct mdesc_mem_ops kmalloc_mdesc_memops = { + .alloc = mdesc_kmalloc, + .free = mdesc_kfree, +}; + +static struct mdesc_handle *mdesc_alloc(unsigned int mdesc_size, + struct mdesc_mem_ops *mops) { - struct mdesc_node *mp; + struct mdesc_handle *hp = mops->alloc(mdesc_size); - mp = from ? from->allnodes_next : allnodes; - for (; mp != NULL; mp = mp->allnodes_next) { - if (strcmp(mp->name, name) == 0) - break; - } - return mp; -} -EXPORT_SYMBOL(md_find_node_by_name); + if (hp) + hp->mops = mops; -static unsigned int mdesc_early_allocated; + return hp; +} -static void * __init mdesc_early_alloc(unsigned long size) +static void mdesc_free(struct mdesc_handle *hp) { - void *ret; + hp->mops->free(hp); +} - ret = __alloc_bootmem(size, SMP_CACHE_BYTES, 0UL); - if (ret == NULL) { - prom_printf("MDESC: alloc of %lu bytes failed.\n", size); - prom_halt(); - } +static struct mdesc_handle *cur_mdesc; +static LIST_HEAD(mdesc_zombie_list); +static DEFINE_SPINLOCK(mdesc_lock); - memset(ret, 0, size); +struct mdesc_handle *mdesc_grab(void) +{ + struct mdesc_handle *hp; + unsigned long flags; - mdesc_early_allocated += size; + spin_lock_irqsave(&mdesc_lock, flags); + hp = cur_mdesc; + if (hp) + atomic_inc(&hp->refcnt); + spin_unlock_irqrestore(&mdesc_lock, flags); - return ret; + return hp; } +EXPORT_SYMBOL(mdesc_grab); -static unsigned int __init count_arcs(struct mdesc_elem *ep) +void mdesc_release(struct mdesc_handle *hp) { - unsigned int ret = 0; + unsigned long flags; - ep++; - while (ep->tag != MD_NODE_END) { - if (ep->tag == MD_PROP_ARC) - ret++; - ep++; + spin_lock_irqsave(&mdesc_lock, flags); + if (atomic_dec_and_test(&hp->refcnt)) { + list_del_init(&hp->list); + hp->mops->free(hp); } - return ret; + spin_unlock_irqrestore(&mdesc_lock, flags); } +EXPORT_SYMBOL(mdesc_release); -static void __init mdesc_node_alloc(u64 node, struct mdesc_elem *ep, const char *names) +static void do_mdesc_update(struct work_struct *work) { - unsigned int num_arcs = count_arcs(ep); - struct mdesc_node *mp; + unsigned long len, real_len, status; + struct mdesc_handle *hp, *orig_hp; + unsigned long flags; + + (void) sun4v_mach_desc(0UL, 0UL, &len); + + hp = mdesc_alloc(len, &kmalloc_mdesc_memops); + if (!hp) { + printk(KERN_ERR "MD: mdesc alloc fails\n"); + return; + } + + status = sun4v_mach_desc(__pa(&hp->mdesc), len, &real_len); + if (status != HV_EOK || real_len > len) { + printk(KERN_ERR "MD: mdesc reread fails with %lu\n", + status); + atomic_dec(&hp->refcnt); + mdesc_free(hp); + return; + } - mp = mdesc_early_alloc(sizeof(*mp) + - (num_arcs * sizeof(struct mdesc_arc))); - mp->name = names + ep->name_offset; - mp->node = node; - mp->unique_id = unique_id++; - mp->num_arcs = num_arcs; + spin_lock_irqsave(&mdesc_lock, flags); + orig_hp = cur_mdesc; + cur_mdesc = hp; - hash_node(mp); + if (atomic_dec_and_test(&orig_hp->refcnt)) + mdesc_free(orig_hp); + else + list_add(&orig_hp->list, &mdesc_zombie_list); + spin_unlock_irqrestore(&mdesc_lock, flags); } -static inline struct mdesc_elem *node_block(struct mdesc_hdr *mdesc) +static DECLARE_WORK(mdesc_update_work, do_mdesc_update); + +void mdesc_update(void) +{ + schedule_work(&mdesc_update_work); +} + +static struct mdesc_elem *node_block(struct mdesc_hdr *mdesc) { return (struct mdesc_elem *) (mdesc + 1); } -static inline void *name_block(struct mdesc_hdr *mdesc) +static void *name_block(struct mdesc_hdr *mdesc) { return ((void *) node_block(mdesc)) + mdesc->node_sz; } -static inline void *data_block(struct mdesc_hdr *mdesc) +static void *data_block(struct mdesc_hdr *mdesc) { return ((void *) name_block(mdesc)) + mdesc->name_sz; } -/* In order to avoid recursion (the graph can be very deep) we use a - * two pass algorithm. First we allocate all the nodes and hash them. - * Then we iterate over each node, filling in the arcs and properties. - */ -static void __init build_all_nodes(struct mdesc_hdr *mdesc) +u64 mdesc_node_by_name(struct mdesc_handle *hp, + u64 from_node, const char *name) { - struct mdesc_elem *start, *ep; - struct mdesc_node *mp; - const char *names; - void *data; - u64 last_node; - - start = ep = node_block(mdesc); - last_node = mdesc->node_sz / 16; + struct mdesc_elem *ep = node_block(&hp->mdesc); + const char *names = name_block(&hp->mdesc); + u64 last_node = hp->mdesc.node_sz / 16; + u64 ret; + + if (from_node == MDESC_NODE_NULL) + from_node = 0; + + if (from_node >= last_node) + return MDESC_NODE_NULL; + + ret = ep[from_node].d.val; + while (ret < last_node) { + if (ep[ret].tag != MD_NODE) + return MDESC_NODE_NULL; + if (!strcmp(names + ep[ret].name_offset, name)) + break; + ret = ep[ret].d.val; + } + if (ret >= last_node) + ret = MDESC_NODE_NULL; + return ret; +} +EXPORT_SYMBOL(mdesc_node_by_name); - names = name_block(mdesc); +const void *mdesc_get_property(struct mdesc_handle *hp, u64 node, + const char *name, int *lenp) +{ + const char *names = name_block(&hp->mdesc); + u64 last_node = hp->mdesc.node_sz / 16; + void *data = data_block(&hp->mdesc); + struct mdesc_elem *ep; - while (1) { - u64 node = ep - start; + if (node == MDESC_NODE_NULL || node >= last_node) + return NULL; - if (ep->tag == MD_LIST_END) + ep = node_block(&hp->mdesc) + node; + ep++; + for (; ep->tag != MD_NODE_END; ep++) { + void *val = NULL; + int len = 0; + + switch (ep->tag) { + case MD_PROP_VAL: + val = &ep->d.val; + len = 8; break; - if (ep->tag != MD_NODE) { - prom_printf("MDESC: Inconsistent element list.\n"); - prom_halt(); - } - - mdesc_node_alloc(node, ep, names); + case MD_PROP_STR: + case MD_PROP_DATA: + val = data + ep->d.data.data_offset; + len = ep->d.data.data_len; + break; - if (ep->d.val >= last_node) { - printk("MDESC: Warning, early break out of node scan.\n"); - printk("MDESC: Next node [%lu] last_node [%lu].\n", - node, last_node); + default: break; } + if (!val) + continue; - ep = start + ep->d.val; + if (!strcmp(names + ep->name_offset, name)) { + if (lenp) + *lenp = len; + return val; + } } - data = data_block(mdesc); - for (mp = allnodes; mp; mp = mp->allnodes_next) { - struct mdesc_elem *ep = start + mp->node; - struct property **link = &mp->properties; - unsigned int this_arc = 0; - - ep++; - while (ep->tag != MD_NODE_END) { - switch (ep->tag) { - case MD_PROP_ARC: { - struct mdesc_node *target; - - if (this_arc >= mp->num_arcs) { - prom_printf("MDESC: ARC overrun [%u:%u]\n", - this_arc, mp->num_arcs); - prom_halt(); - } - target = find_node(ep->d.val); - if (!target) { - printk("MDESC: Warning, arc points to " - "missing node, ignoring.\n"); - break; - } - mp->arcs[this_arc].name = - (names + ep->name_offset); - mp->arcs[this_arc].arc = target; - this_arc++; - break; - } + return NULL; +} +EXPORT_SYMBOL(mdesc_get_property); - case MD_PROP_VAL: - case MD_PROP_STR: - case MD_PROP_DATA: { - struct property *p = mdesc_early_alloc(sizeof(*p)); - - p->unique_id = unique_id++; - p->name = (char *) names + ep->name_offset; - if (ep->tag == MD_PROP_VAL) { - p->value = &ep->d.val; - p->length = 8; - } else { - p->value = data + ep->d.data.data_offset; - p->length = ep->d.data.data_len; - } - *link = p; - link = &p->next; - break; - } +u64 mdesc_next_arc(struct mdesc_handle *hp, u64 from, const char *arc_type) +{ + struct mdesc_elem *ep, *base = node_block(&hp->mdesc); + const char *names = name_block(&hp->mdesc); + u64 last_node = hp->mdesc.node_sz / 16; - case MD_NOOP: - break; + if (from == MDESC_NODE_NULL || from >= last_node) + return MDESC_NODE_NULL; - default: - printk("MDESC: Warning, ignoring unknown tag type %02x\n", - ep->tag); - } - ep++; - } + ep = base + from; + + ep++; + for (; ep->tag != MD_NODE_END; ep++) { + if (ep->tag != MD_PROP_ARC) + continue; + + if (strcmp(names + ep->name_offset, arc_type)) + continue; + + return ep - base; } + + return MDESC_NODE_NULL; } +EXPORT_SYMBOL(mdesc_next_arc); -static unsigned int __init count_nodes(struct mdesc_hdr *mdesc) +u64 mdesc_arc_target(struct mdesc_handle *hp, u64 arc) { - struct mdesc_elem *ep = node_block(mdesc); - struct mdesc_elem *end; - unsigned int cnt = 0; - - end = ((void *)ep) + mdesc->node_sz; - while (ep < end) { - if (ep->tag == MD_NODE) - cnt++; - ep++; - } - return cnt; + struct mdesc_elem *ep, *base = node_block(&hp->mdesc); + + ep = base + arc; + + return ep->d.val; +} +EXPORT_SYMBOL(mdesc_arc_target); + +const char *mdesc_node_name(struct mdesc_handle *hp, u64 node) +{ + struct mdesc_elem *ep, *base = node_block(&hp->mdesc); + const char *names = name_block(&hp->mdesc); + u64 last_node = hp->mdesc.node_sz / 16; + + if (node == MDESC_NODE_NULL || node >= last_node) + return NULL; + + ep = base + node; + if (ep->tag != MD_NODE) + return NULL; + + return names + ep->name_offset; } +EXPORT_SYMBOL(mdesc_node_name); static void __init report_platform_properties(void) { - struct mdesc_node *pn = md_find_node_by_name(NULL, "platform"); + struct mdesc_handle *hp = mdesc_grab(); + u64 pn = mdesc_node_by_name(hp, MDESC_NODE_NULL, "platform"); const char *s; const u64 *v; - if (!pn) { + if (pn == MDESC_NODE_NULL) { prom_printf("No platform node in machine-description.\n"); prom_halt(); } - s = md_get_property(pn, "banner-name", NULL); + s = mdesc_get_property(hp, pn, "banner-name", NULL); printk("PLATFORM: banner-name [%s]\n", s); - s = md_get_property(pn, "name", NULL); + s = mdesc_get_property(hp, pn, "name", NULL); printk("PLATFORM: name [%s]\n", s); - v = md_get_property(pn, "hostid", NULL); + v = mdesc_get_property(hp, pn, "hostid", NULL); if (v) printk("PLATFORM: hostid [%08lx]\n", *v); - v = md_get_property(pn, "serial#", NULL); + v = mdesc_get_property(hp, pn, "serial#", NULL); if (v) printk("PLATFORM: serial# [%08lx]\n", *v); - v = md_get_property(pn, "stick-frequency", NULL); + v = mdesc_get_property(hp, pn, "stick-frequency", NULL); printk("PLATFORM: stick-frequency [%08lx]\n", *v); - v = md_get_property(pn, "mac-address", NULL); + v = mdesc_get_property(hp, pn, "mac-address", NULL); if (v) printk("PLATFORM: mac-address [%lx]\n", *v); - v = md_get_property(pn, "watchdog-resolution", NULL); + v = mdesc_get_property(hp, pn, "watchdog-resolution", NULL); if (v) printk("PLATFORM: watchdog-resolution [%lu ms]\n", *v); - v = md_get_property(pn, "watchdog-max-timeout", NULL); + v = mdesc_get_property(hp, pn, "watchdog-max-timeout", NULL); if (v) printk("PLATFORM: watchdog-max-timeout [%lu ms]\n", *v); - v = md_get_property(pn, "max-cpus", NULL); + v = mdesc_get_property(hp, pn, "max-cpus", NULL); if (v) printk("PLATFORM: max-cpus [%lu]\n", *v); + +#ifdef CONFIG_SMP + { + int max_cpu, i; + + if (v) { + max_cpu = *v; + if (max_cpu > NR_CPUS) + max_cpu = NR_CPUS; + } else { + max_cpu = NR_CPUS; + } + for (i = 0; i < max_cpu; i++) + cpu_set(i, cpu_possible_map); + } +#endif + + mdesc_release(hp); } static int inline find_in_proplist(const char *list, const char *match, int len) @@ -369,15 +468,17 @@ static int inline find_in_proplist(const char *list, const char *match, int len) return 0; } -static void __init fill_in_one_cache(cpuinfo_sparc *c, struct mdesc_node *mp) +static void __devinit fill_in_one_cache(cpuinfo_sparc *c, + struct mdesc_handle *hp, + u64 mp) { - const u64 *level = md_get_property(mp, "level", NULL); - const u64 *size = md_get_property(mp, "size", NULL); - const u64 *line_size = md_get_property(mp, "line-size", NULL); + const u64 *level = mdesc_get_property(hp, mp, "level", NULL); + const u64 *size = mdesc_get_property(hp, mp, "size", NULL); + const u64 *line_size = mdesc_get_property(hp, mp, "line-size", NULL); const char *type; int type_len; - type = md_get_property(mp, "type", &type_len); + type = mdesc_get_property(hp, mp, "type", &type_len); switch (*level) { case 1: @@ -400,48 +501,45 @@ static void __init fill_in_one_cache(cpuinfo_sparc *c, struct mdesc_node *mp) } if (*level == 1) { - unsigned int i; - - for (i = 0; i < mp->num_arcs; i++) { - struct mdesc_node *t = mp->arcs[i].arc; + u64 a; - if (strcmp(mp->arcs[i].name, "fwd")) - continue; + mdesc_for_each_arc(a, hp, mp, MDESC_ARC_TYPE_FWD) { + u64 target = mdesc_arc_target(hp, a); + const char *name = mdesc_node_name(hp, target); - if (!strcmp(t->name, "cache")) - fill_in_one_cache(c, t); + if (!strcmp(name, "cache")) + fill_in_one_cache(c, hp, target); } } } -static void __init mark_core_ids(struct mdesc_node *mp, int core_id) +static void __devinit mark_core_ids(struct mdesc_handle *hp, u64 mp, + int core_id) { - unsigned int i; + u64 a; - for (i = 0; i < mp->num_arcs; i++) { - struct mdesc_node *t = mp->arcs[i].arc; + mdesc_for_each_arc(a, hp, mp, MDESC_ARC_TYPE_BACK) { + u64 t = mdesc_arc_target(hp, a); + const char *name; const u64 *id; - if (strcmp(mp->arcs[i].name, "back")) - continue; - - if (!strcmp(t->name, "cpu")) { - id = md_get_property(t, "id", NULL); + name = mdesc_node_name(hp, t); + if (!strcmp(name, "cpu")) { + id = mdesc_get_property(hp, t, "id", NULL); if (*id < NR_CPUS) cpu_data(*id).core_id = core_id; } else { - unsigned int j; + u64 j; - for (j = 0; j < t->num_arcs; j++) { - struct mdesc_node *n = t->arcs[j].arc; + mdesc_for_each_arc(j, hp, t, MDESC_ARC_TYPE_BACK) { + u64 n = mdesc_arc_target(hp, j); + const char *n_name; - if (strcmp(t->arcs[j].name, "back")) + n_name = mdesc_node_name(hp, n); + if (strcmp(n_name, "cpu")) continue; - if (strcmp(n->name, "cpu")) - continue; - - id = md_get_property(n, "id", NULL); + id = mdesc_get_property(hp, n, "id", NULL); if (*id < NR_CPUS) cpu_data(*id).core_id = core_id; } @@ -449,78 +547,81 @@ static void __init mark_core_ids(struct mdesc_node *mp, int core_id) } } -static void __init set_core_ids(void) +static void __devinit set_core_ids(struct mdesc_handle *hp) { - struct mdesc_node *mp; int idx; + u64 mp; idx = 1; - md_for_each_node_by_name(mp, "cache") { - const u64 *level = md_get_property(mp, "level", NULL); + mdesc_for_each_node_by_name(hp, mp, "cache") { + const u64 *level; const char *type; int len; + level = mdesc_get_property(hp, mp, "level", NULL); if (*level != 1) continue; - type = md_get_property(mp, "type", &len); + type = mdesc_get_property(hp, mp, "type", &len); if (!find_in_proplist(type, "instn", len)) continue; - mark_core_ids(mp, idx); + mark_core_ids(hp, mp, idx); idx++; } } -static void __init mark_proc_ids(struct mdesc_node *mp, int proc_id) +static void __devinit mark_proc_ids(struct mdesc_handle *hp, u64 mp, + int proc_id) { - int i; + u64 a; - for (i = 0; i < mp->num_arcs; i++) { - struct mdesc_node *t = mp->arcs[i].arc; + mdesc_for_each_arc(a, hp, mp, MDESC_ARC_TYPE_BACK) { + u64 t = mdesc_arc_target(hp, a); + const char *name; const u64 *id; - if (strcmp(mp->arcs[i].name, "back")) - continue; - - if (strcmp(t->name, "cpu")) + name = mdesc_node_name(hp, t); + if (strcmp(name, "cpu")) continue; - id = md_get_property(t, "id", NULL); + id = mdesc_get_property(hp, t, "id", NULL); if (*id < NR_CPUS) cpu_data(*id).proc_id = proc_id; } } -static void __init __set_proc_ids(const char *exec_unit_name) +static void __devinit __set_proc_ids(struct mdesc_handle *hp, + const char *exec_unit_name) { - struct mdesc_node *mp; int idx; + u64 mp; idx = 0; - md_for_each_node_by_name(mp, exec_unit_name) { + mdesc_for_each_node_by_name(hp, mp, exec_unit_name) { const char *type; int len; - type = md_get_property(mp, "type", &len); + type = mdesc_get_property(hp, mp, "type", &len); if (!find_in_proplist(type, "int", len) && !find_in_proplist(type, "integer", len)) continue; - mark_proc_ids(mp, idx); + mark_proc_ids(hp, mp, idx); idx++; } } -static void __init set_proc_ids(void) +static void __devinit set_proc_ids(struct mdesc_handle *hp) { - __set_proc_ids("exec_unit"); - __set_proc_ids("exec-unit"); + __set_proc_ids(hp, "exec_unit"); + __set_proc_ids(hp, "exec-unit"); } -static void __init get_one_mondo_bits(const u64 *p, unsigned int *mask, unsigned char def) +static void __devinit get_one_mondo_bits(const u64 *p, unsigned int *mask, + unsigned char def) { u64 val; @@ -538,35 +639,37 @@ use_default: *mask = ((1U << def) * 64U) - 1U; } -static void __init get_mondo_data(struct mdesc_node *mp, struct trap_per_cpu *tb) +static void __devinit get_mondo_data(struct mdesc_handle *hp, u64 mp, + struct trap_per_cpu *tb) { const u64 *val; - val = md_get_property(mp, "q-cpu-mondo-#bits", NULL); + val = mdesc_get_property(hp, mp, "q-cpu-mondo-#bits", NULL); get_one_mondo_bits(val, &tb->cpu_mondo_qmask, 7); - val = md_get_property(mp, "q-dev-mondo-#bits", NULL); + val = mdesc_get_property(hp, mp, "q-dev-mondo-#bits", NULL); get_one_mondo_bits(val, &tb->dev_mondo_qmask, 7); - val = md_get_property(mp, "q-resumable-#bits", NULL); + val = mdesc_get_property(hp, mp, "q-resumable-#bits", NULL); get_one_mondo_bits(val, &tb->resum_qmask, 6); - val = md_get_property(mp, "q-nonresumable-#bits", NULL); + val = mdesc_get_property(hp, mp, "q-nonresumable-#bits", NULL); get_one_mondo_bits(val, &tb->nonresum_qmask, 2); } -static void __init mdesc_fill_in_cpu_data(void) +void __devinit mdesc_fill_in_cpu_data(cpumask_t mask) { - struct mdesc_node *mp; + struct mdesc_handle *hp = mdesc_grab(); + u64 mp; ncpus_probed = 0; - md_for_each_node_by_name(mp, "cpu") { - const u64 *id = md_get_property(mp, "id", NULL); - const u64 *cfreq = md_get_property(mp, "clock-frequency", NULL); + mdesc_for_each_node_by_name(hp, mp, "cpu") { + const u64 *id = mdesc_get_property(hp, mp, "id", NULL); + const u64 *cfreq = mdesc_get_property(hp, mp, "clock-frequency", NULL); struct trap_per_cpu *tb; cpuinfo_sparc *c; - unsigned int i; int cpuid; + u64 a; ncpus_probed++; @@ -575,6 +678,8 @@ static void __init mdesc_fill_in_cpu_data(void) #ifdef CONFIG_SMP if (cpuid >= NR_CPUS) continue; + if (!cpu_isset(cpuid, mask)) + continue; #else /* On uniprocessor we only want the values for the * real physical cpu the kernel booted onto, however @@ -589,35 +694,30 @@ static void __init mdesc_fill_in_cpu_data(void) c->clock_tick = *cfreq; tb = &trap_block[cpuid]; - get_mondo_data(mp, tb); - - for (i = 0; i < mp->num_arcs; i++) { - struct mdesc_node *t = mp->arcs[i].arc; - unsigned int j; + get_mondo_data(hp, mp, tb); - if (strcmp(mp->arcs[i].name, "fwd")) - continue; + mdesc_for_each_arc(a, hp, mp, MDESC_ARC_TYPE_FWD) { + u64 j, t = mdesc_arc_target(hp, a); + const char *t_name; - if (!strcmp(t->name, "cache")) { - fill_in_one_cache(c, t); + t_name = mdesc_node_name(hp, t); + if (!strcmp(t_name, "cache")) { + fill_in_one_cache(c, hp, t); continue; } - for (j = 0; j < t->num_arcs; j++) { - struct mdesc_node *n; + mdesc_for_each_arc(j, hp, t, MDESC_ARC_TYPE_FWD) { + u64 n = mdesc_arc_target(hp, j); + const char *n_name; - n = t->arcs[j].arc; - if (strcmp(t->arcs[j].name, "fwd")) - continue; - - if (!strcmp(n->name, "cache")) - fill_in_one_cache(c, n); + n_name = mdesc_node_name(hp, n); + if (!strcmp(n_name, "cache")) + fill_in_one_cache(c, hp, n); } } #ifdef CONFIG_SMP cpu_set(cpuid, cpu_present_map); - cpu_set(cpuid, phys_cpu_present_map); #endif c->core_id = 0; @@ -628,45 +728,43 @@ static void __init mdesc_fill_in_cpu_data(void) sparc64_multi_core = 1; #endif - set_core_ids(); - set_proc_ids(); + set_core_ids(hp); + set_proc_ids(hp); smp_fill_in_sib_core_maps(); + + mdesc_release(hp); } void __init sun4v_mdesc_init(void) { + struct mdesc_handle *hp; unsigned long len, real_len, status; + cpumask_t mask; (void) sun4v_mach_desc(0UL, 0UL, &len); printk("MDESC: Size is %lu bytes.\n", len); - main_mdesc = mdesc_early_alloc(len); + hp = mdesc_alloc(len, &bootmem_mdesc_memops); + if (hp == NULL) { + prom_printf("MDESC: alloc of %lu bytes failed.\n", len); + prom_halt(); + } - status = sun4v_mach_desc(__pa(main_mdesc), len, &real_len); + status = sun4v_mach_desc(__pa(&hp->mdesc), len, &real_len); if (status != HV_EOK || real_len > len) { prom_printf("sun4v_mach_desc fails, err(%lu), " "len(%lu), real_len(%lu)\n", status, len, real_len); + mdesc_free(hp); prom_halt(); } - len = count_nodes(main_mdesc); - printk("MDESC: %lu nodes.\n", len); - - len = roundup_pow_of_two(len); - - mdesc_hash = mdesc_early_alloc(len * sizeof(struct mdesc_node *)); - mdesc_hash_size = len; - - printk("MDESC: Hash size %lu entries.\n", len); - - build_all_nodes(main_mdesc); - - printk("MDESC: Built graph with %u bytes of memory.\n", - mdesc_early_allocated); + cur_mdesc = hp; report_platform_properties(); - mdesc_fill_in_cpu_data(); + + cpus_setall(mask); + mdesc_fill_in_cpu_data(mask); } diff --git a/arch/sparc64/kernel/power.c b/arch/sparc64/kernel/power.c index 5d6adea3967f..8dd4294ad21e 100644 --- a/arch/sparc64/kernel/power.c +++ b/arch/sparc64/kernel/power.c @@ -1,7 +1,6 @@ -/* $Id: power.c,v 1.10 2001/12/11 01:57:16 davem Exp $ - * power.c: Power management driver. +/* power.c: Power management driver. * - * Copyright (C) 1999 David S. Miller (davem@redhat.com) + * Copyright (C) 1999, 2007 David S. Miller (davem@davemloft.net) */ #include <linux/kernel.h> @@ -19,6 +18,7 @@ #include <asm/prom.h> #include <asm/of_device.h> #include <asm/io.h> +#include <asm/power.h> #include <asm/sstate.h> #include <linux/unistd.h> @@ -29,24 +29,26 @@ */ int scons_pwroff = 1; -#ifdef CONFIG_PCI -#include <linux/pci.h> static void __iomem *power_reg; static DECLARE_WAIT_QUEUE_HEAD(powerd_wait); static int button_pressed; -static irqreturn_t power_handler(int irq, void *dev_id) +void wake_up_powerd(void) { if (button_pressed == 0) { button_pressed = 1; wake_up(&powerd_wait); } +} + +static irqreturn_t power_handler(int irq, void *dev_id) +{ + wake_up_powerd(); /* FIXME: Check registers for status... */ return IRQ_HANDLED; } -#endif /* CONFIG_PCI */ extern void machine_halt(void); extern void machine_alt_power_off(void); @@ -56,19 +58,18 @@ void machine_power_off(void) { sstate_poweroff(); if (!serial_console || scons_pwroff) { -#ifdef CONFIG_PCI if (power_reg) { /* Both register bits seem to have the * same effect, so until I figure out * what the difference is... */ writel(AUXIO_PCIO_CPWR_OFF | AUXIO_PCIO_SPWR_OFF, power_reg); - } else -#endif /* CONFIG_PCI */ + } else { if (poweroff_method != NULL) { poweroff_method(); /* not reached */ } + } } machine_halt(); } @@ -76,7 +77,6 @@ void machine_power_off(void) void (*pm_power_off)(void) = machine_power_off; EXPORT_SYMBOL(pm_power_off); -#ifdef CONFIG_PCI static int powerd(void *__unused) { static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; @@ -86,7 +86,7 @@ static int powerd(void *__unused) daemonize("powerd"); add_wait_queue(&powerd_wait, &wait); -again: + for (;;) { set_task_state(current, TASK_INTERRUPTIBLE); if (button_pressed) @@ -100,16 +100,28 @@ again: /* Ok, down we go... */ button_pressed = 0; if (kernel_execve("/sbin/shutdown", argv, envp) < 0) { - printk("powerd: shutdown execution failed\n"); - add_wait_queue(&powerd_wait, &wait); - goto again; + printk(KERN_ERR "powerd: shutdown execution failed\n"); + machine_power_off(); } return 0; } +int start_powerd(void) +{ + int err; + + err = kernel_thread(powerd, NULL, CLONE_FS); + if (err < 0) + printk(KERN_ERR "power: Failed to start power daemon.\n"); + else + printk(KERN_INFO "power: powerd running.\n"); + + return err; +} + static int __init has_button_interrupt(unsigned int irq, struct device_node *dp) { - if (irq == PCI_IRQ_NONE) + if (irq == 0xffffffff) return 0; if (!of_find_property(dp, "button", NULL)) return 0; @@ -130,17 +142,14 @@ static int __devinit power_probe(struct of_device *op, const struct of_device_id poweroff_method = machine_halt; /* able to use the standard halt */ if (has_button_interrupt(irq, op->node)) { - if (kernel_thread(powerd, NULL, CLONE_FS) < 0) { - printk("Failed to start power daemon.\n"); + if (start_powerd() < 0) return 0; - } - printk("powerd running.\n"); if (request_irq(irq, power_handler, 0, "power", NULL) < 0) - printk("power: Error, cannot register IRQ handler.\n"); + printk(KERN_ERR "power: Cannot setup IRQ handler.\n"); } else { - printk("not using powerd.\n"); + printk(KERN_INFO "power: Not using powerd.\n"); } return 0; @@ -164,4 +173,3 @@ void __init power_init(void) of_register_driver(&power_driver, &of_bus_type); return; } -#endif /* CONFIG_PCI */ diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c index f5f97e2c669c..93557507ec9f 100644 --- a/arch/sparc64/kernel/process.c +++ b/arch/sparc64/kernel/process.c @@ -29,6 +29,7 @@ #include <linux/compat.h> #include <linux/tick.h> #include <linux/init.h> +#include <linux/cpu.h> #include <asm/oplib.h> #include <asm/uaccess.h> @@ -49,7 +50,7 @@ /* #define VERBOSE_SHOWREGS */ -static void sparc64_yield(void) +static void sparc64_yield(int cpu) { if (tlb_type != hypervisor) return; @@ -57,7 +58,7 @@ static void sparc64_yield(void) clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb__after_clear_bit(); - while (!need_resched()) { + while (!need_resched() && !cpu_is_offline(cpu)) { unsigned long pstate; /* Disable interrupts. */ @@ -68,7 +69,7 @@ static void sparc64_yield(void) : "=&r" (pstate) : "i" (PSTATE_IE)); - if (!need_resched()) + if (!need_resched() && !cpu_is_offline(cpu)) sun4v_cpu_yield(); /* Re-enable interrupts. */ @@ -86,15 +87,25 @@ static void sparc64_yield(void) /* The idle loop on sparc64. */ void cpu_idle(void) { + int cpu = smp_processor_id(); + set_thread_flag(TIF_POLLING_NRFLAG); while(1) { tick_nohz_stop_sched_tick(); - while (!need_resched()) - sparc64_yield(); + + while (!need_resched() && !cpu_is_offline(cpu)) + sparc64_yield(cpu); + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); + +#ifdef CONFIG_HOTPLUG_CPU + if (cpu_is_offline(cpu)) + cpu_play_dead(); +#endif + schedule(); preempt_disable(); } diff --git a/arch/sparc64/kernel/prom.c b/arch/sparc64/kernel/prom.c index 61036b346664..5d220302cd50 100644 --- a/arch/sparc64/kernel/prom.c +++ b/arch/sparc64/kernel/prom.c @@ -1808,7 +1808,7 @@ static void __init of_fill_in_cpu_data(void) #ifdef CONFIG_SMP cpu_set(cpuid, cpu_present_map); - cpu_set(cpuid, phys_cpu_present_map); + cpu_set(cpuid, cpu_possible_map); #endif } diff --git a/arch/sparc64/kernel/setup.c b/arch/sparc64/kernel/setup.c index 7490cc670a53..dc928e49e341 100644 --- a/arch/sparc64/kernel/setup.c +++ b/arch/sparc64/kernel/setup.c @@ -442,7 +442,6 @@ static int show_cpuinfo(struct seq_file *m, void *__unused) "D$ parity tl1\t: %u\n" "I$ parity tl1\t: %u\n" #ifndef CONFIG_SMP - "Cpu0Bogo\t: %lu.%02lu\n" "Cpu0ClkTck\t: %016lx\n" #endif , @@ -455,10 +454,8 @@ static int show_cpuinfo(struct seq_file *m, void *__unused) ncpus_probed, num_online_cpus(), dcache_parity_tl1_occurred, - icache_parity_tl1_occurred + icache_parity_tl1_occurred, #ifndef CONFIG_SMP - , cpu_data(0).udelay_val/(500000/HZ), - (cpu_data(0).udelay_val/(5000/HZ)) % 100, cpu_data(0).clock_tick #endif ); diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 40e40f968d61..b448d33321c6 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -1,6 +1,6 @@ /* smp.c: Sparc64 SMP support. * - * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1997, 2007 David S. Miller (davem@davemloft.net) */ #include <linux/module.h> @@ -28,6 +28,8 @@ #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/cpudata.h> +#include <asm/hvtramp.h> +#include <asm/io.h> #include <asm/irq.h> #include <asm/irq_regs.h> @@ -41,22 +43,26 @@ #include <asm/sections.h> #include <asm/prom.h> #include <asm/mdesc.h> +#include <asm/ldc.h> +#include <asm/hypervisor.h> extern void calibrate_delay(void); int sparc64_multi_core __read_mostly; -/* Please don't make this stuff initdata!!! --DaveM */ -unsigned char boot_cpu_id; - +cpumask_t cpu_possible_map __read_mostly = CPU_MASK_NONE; cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE; -cpumask_t phys_cpu_present_map __read_mostly = CPU_MASK_NONE; cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = CPU_MASK_NONE }; cpumask_t cpu_core_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = CPU_MASK_NONE }; + +EXPORT_SYMBOL(cpu_possible_map); +EXPORT_SYMBOL(cpu_online_map); +EXPORT_SYMBOL(cpu_sibling_map); +EXPORT_SYMBOL(cpu_core_map); + static cpumask_t smp_commenced_mask; -static cpumask_t cpu_callout_map; void smp_info(struct seq_file *m) { @@ -73,18 +79,17 @@ void smp_bogo(struct seq_file *m) for_each_online_cpu(i) seq_printf(m, - "Cpu%dBogo\t: %lu.%02lu\n" "Cpu%dClkTck\t: %016lx\n", - i, cpu_data(i).udelay_val / (500000/HZ), - (cpu_data(i).udelay_val / (5000/HZ)) % 100, i, cpu_data(i).clock_tick); } +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock); + extern void setup_sparc64_timer(void); static volatile unsigned long callin_flag = 0; -void __init smp_callin(void) +void __devinit smp_callin(void) { int cpuid = hard_smp_processor_id(); @@ -102,8 +107,6 @@ void __init smp_callin(void) local_irq_enable(); - calibrate_delay(); - cpu_data(cpuid).udelay_val = loops_per_jiffy; callin_flag = 1; __asm__ __volatile__("membar #Sync\n\t" "flush %%g6" : : : "memory"); @@ -120,7 +123,9 @@ void __init smp_callin(void) while (!cpu_isset(cpuid, smp_commenced_mask)) rmb(); + spin_lock(&call_lock); cpu_set(cpuid, cpu_online_map); + spin_unlock(&call_lock); /* idle thread is expected to have preempt disabled */ preempt_disable(); @@ -268,6 +273,67 @@ static void smp_synchronize_one_tick(int cpu) spin_unlock_irqrestore(&itc_sync_lock, flags); } +#if defined(CONFIG_SUN_LDOMS) && defined(CONFIG_HOTPLUG_CPU) +/* XXX Put this in some common place. XXX */ +static unsigned long kimage_addr_to_ra(void *p) +{ + unsigned long val = (unsigned long) p; + + return kern_base + (val - KERNBASE); +} + +static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg) +{ + extern unsigned long sparc64_ttable_tl0; + extern unsigned long kern_locked_tte_data; + extern int bigkernel; + struct hvtramp_descr *hdesc; + unsigned long trampoline_ra; + struct trap_per_cpu *tb; + u64 tte_vaddr, tte_data; + unsigned long hv_err; + + hdesc = kzalloc(sizeof(*hdesc), GFP_KERNEL); + if (!hdesc) { + printk(KERN_ERR "ldom_startcpu_cpuid: Cannot allocate " + "hvtramp_descr.\n"); + return; + } + + hdesc->cpu = cpu; + hdesc->num_mappings = (bigkernel ? 2 : 1); + + tb = &trap_block[cpu]; + tb->hdesc = hdesc; + + hdesc->fault_info_va = (unsigned long) &tb->fault_info; + hdesc->fault_info_pa = kimage_addr_to_ra(&tb->fault_info); + + hdesc->thread_reg = thread_reg; + + tte_vaddr = (unsigned long) KERNBASE; + tte_data = kern_locked_tte_data; + + hdesc->maps[0].vaddr = tte_vaddr; + hdesc->maps[0].tte = tte_data; + if (bigkernel) { + tte_vaddr += 0x400000; + tte_data += 0x400000; + hdesc->maps[1].vaddr = tte_vaddr; + hdesc->maps[1].tte = tte_data; + } + + trampoline_ra = kimage_addr_to_ra(hv_cpu_startup); + + hv_err = sun4v_cpu_start(cpu, trampoline_ra, + kimage_addr_to_ra(&sparc64_ttable_tl0), + __pa(hdesc)); + if (hv_err) + printk(KERN_ERR "ldom_startcpu_cpuid: sun4v_cpu_start() " + "gives error %lu\n", hv_err); +} +#endif + extern void sun4v_init_mondo_queues(int use_bootmem, int cpu, int alloc, int load); extern unsigned long sparc64_cpu_startup; @@ -280,6 +346,7 @@ static struct thread_info *cpu_new_thread = NULL; static int __devinit smp_boot_one_cpu(unsigned int cpu) { + struct trap_per_cpu *tb = &trap_block[cpu]; unsigned long entry = (unsigned long)(&sparc64_cpu_startup); unsigned long cookie = @@ -290,20 +357,25 @@ static int __devinit smp_boot_one_cpu(unsigned int cpu) p = fork_idle(cpu); callin_flag = 0; cpu_new_thread = task_thread_info(p); - cpu_set(cpu, cpu_callout_map); if (tlb_type == hypervisor) { /* Alloc the mondo queues, cpu will load them. */ sun4v_init_mondo_queues(0, cpu, 1, 0); - prom_startcpu_cpuid(cpu, entry, cookie); +#if defined(CONFIG_SUN_LDOMS) && defined(CONFIG_HOTPLUG_CPU) + if (ldom_domaining_enabled) + ldom_startcpu_cpuid(cpu, + (unsigned long) cpu_new_thread); + else +#endif + prom_startcpu_cpuid(cpu, entry, cookie); } else { struct device_node *dp = of_find_node_by_cpuid(cpu); prom_startcpu(dp->node, entry, cookie); } - for (timeout = 0; timeout < 5000000; timeout++) { + for (timeout = 0; timeout < 50000; timeout++) { if (callin_flag) break; udelay(100); @@ -313,11 +385,15 @@ static int __devinit smp_boot_one_cpu(unsigned int cpu) ret = 0; } else { printk("Processor %d is stuck.\n", cpu); - cpu_clear(cpu, cpu_callout_map); ret = -ENODEV; } cpu_new_thread = NULL; + if (tb->hdesc) { + kfree(tb->hdesc); + tb->hdesc = NULL; + } + return ret; } @@ -720,7 +796,6 @@ struct call_data_struct { int wait; }; -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock); static struct call_data_struct *call_data; extern unsigned long xcall_call_function; @@ -1152,34 +1227,14 @@ void smp_penguin_jailcell(int irq, struct pt_regs *regs) preempt_enable(); } -void __init smp_tick_init(void) -{ - boot_cpu_id = hard_smp_processor_id(); -} - /* /proc/profile writes can call this, don't __init it please. */ int setup_profiling_timer(unsigned int multiplier) { return -EINVAL; } -/* Constrain the number of cpus to max_cpus. */ void __init smp_prepare_cpus(unsigned int max_cpus) { - int i; - - if (num_possible_cpus() > max_cpus) { - for_each_possible_cpu(i) { - if (i != boot_cpu_id) { - cpu_clear(i, phys_cpu_present_map); - cpu_clear(i, cpu_present_map); - if (num_possible_cpus() <= max_cpus) - break; - } - } - } - - cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy; } void __devinit smp_prepare_boot_cpu(void) @@ -1190,30 +1245,32 @@ void __devinit smp_fill_in_sib_core_maps(void) { unsigned int i; - for_each_possible_cpu(i) { + for_each_present_cpu(i) { unsigned int j; + cpus_clear(cpu_core_map[i]); if (cpu_data(i).core_id == 0) { cpu_set(i, cpu_core_map[i]); continue; } - for_each_possible_cpu(j) { + for_each_present_cpu(j) { if (cpu_data(i).core_id == cpu_data(j).core_id) cpu_set(j, cpu_core_map[i]); } } - for_each_possible_cpu(i) { + for_each_present_cpu(i) { unsigned int j; + cpus_clear(cpu_sibling_map[i]); if (cpu_data(i).proc_id == -1) { cpu_set(i, cpu_sibling_map[i]); continue; } - for_each_possible_cpu(j) { + for_each_present_cpu(j) { if (cpu_data(i).proc_id == cpu_data(j).proc_id) cpu_set(j, cpu_sibling_map[i]); @@ -1242,18 +1299,112 @@ int __cpuinit __cpu_up(unsigned int cpu) return ret; } -void __init smp_cpus_done(unsigned int max_cpus) +#ifdef CONFIG_HOTPLUG_CPU +void cpu_play_dead(void) +{ + int cpu = smp_processor_id(); + unsigned long pstate; + + idle_task_exit(); + + if (tlb_type == hypervisor) { + struct trap_per_cpu *tb = &trap_block[cpu]; + + sun4v_cpu_qconf(HV_CPU_QUEUE_CPU_MONDO, + tb->cpu_mondo_pa, 0); + sun4v_cpu_qconf(HV_CPU_QUEUE_DEVICE_MONDO, + tb->dev_mondo_pa, 0); + sun4v_cpu_qconf(HV_CPU_QUEUE_RES_ERROR, + tb->resum_mondo_pa, 0); + sun4v_cpu_qconf(HV_CPU_QUEUE_NONRES_ERROR, + tb->nonresum_mondo_pa, 0); + } + + cpu_clear(cpu, smp_commenced_mask); + membar_safe("#Sync"); + + local_irq_disable(); + + __asm__ __volatile__( + "rdpr %%pstate, %0\n\t" + "wrpr %0, %1, %%pstate" + : "=r" (pstate) + : "i" (PSTATE_IE)); + + while (1) + barrier(); +} + +int __cpu_disable(void) { - unsigned long bogosum = 0; + int cpu = smp_processor_id(); + cpuinfo_sparc *c; int i; - for_each_online_cpu(i) - bogosum += cpu_data(i).udelay_val; - printk("Total of %ld processors activated " - "(%lu.%02lu BogoMIPS).\n", - (long) num_online_cpus(), - bogosum/(500000/HZ), - (bogosum/(5000/HZ))%100); + for_each_cpu_mask(i, cpu_core_map[cpu]) + cpu_clear(cpu, cpu_core_map[i]); + cpus_clear(cpu_core_map[cpu]); + + for_each_cpu_mask(i, cpu_sibling_map[cpu]) + cpu_clear(cpu, cpu_sibling_map[i]); + cpus_clear(cpu_sibling_map[cpu]); + + c = &cpu_data(cpu); + + c->core_id = 0; + c->proc_id = -1; + + spin_lock(&call_lock); + cpu_clear(cpu, cpu_online_map); + spin_unlock(&call_lock); + + smp_wmb(); + + /* Make sure no interrupts point to this cpu. */ + fixup_irqs(); + + local_irq_enable(); + mdelay(1); + local_irq_disable(); + + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + int i; + + for (i = 0; i < 100; i++) { + smp_rmb(); + if (!cpu_isset(cpu, smp_commenced_mask)) + break; + msleep(100); + } + if (cpu_isset(cpu, smp_commenced_mask)) { + printk(KERN_ERR "CPU %u didn't die...\n", cpu); + } else { +#if defined(CONFIG_SUN_LDOMS) + unsigned long hv_err; + int limit = 100; + + do { + hv_err = sun4v_cpu_stop(cpu); + if (hv_err == HV_EOK) { + cpu_clear(cpu, cpu_present_map); + break; + } + } while (--limit > 0); + if (limit <= 0) { + printk(KERN_ERR "sun4v_cpu_stop() fails err=%lu\n", + hv_err); + } +#endif + } +} +#endif + +void __init smp_cpus_done(unsigned int max_cpus) +{ } void smp_send_reschedule(int cpu) diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index 6fa761612899..719d676c2ddc 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -1,7 +1,6 @@ -/* $Id: sparc64_ksyms.c,v 1.121 2002/02/09 19:49:31 davem Exp $ - * arch/sparc64/kernel/sparc64_ksyms.c: Sparc64 specific ksyms support. +/* arch/sparc64/kernel/sparc64_ksyms.c: Sparc64 specific ksyms support. * - * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1996, 2007 David S. Miller (davem@davemloft.net) * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be) * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz) */ @@ -28,7 +27,6 @@ #include <net/compat.h> #include <asm/oplib.h> -#include <asm/delay.h> #include <asm/system.h> #include <asm/auxio.h> #include <asm/pgtable.h> @@ -124,10 +122,6 @@ EXPORT_SYMBOL(__write_lock); EXPORT_SYMBOL(__write_unlock); EXPORT_SYMBOL(__write_trylock); -/* CPU online map and active count. */ -EXPORT_SYMBOL(cpu_online_map); -EXPORT_SYMBOL(phys_cpu_present_map); - EXPORT_SYMBOL(smp_call_function); #endif /* CONFIG_SMP */ @@ -330,12 +324,6 @@ EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(strncmp); -/* Delay routines. */ -EXPORT_SYMBOL(__udelay); -EXPORT_SYMBOL(__ndelay); -EXPORT_SYMBOL(__const_udelay); -EXPORT_SYMBOL(__delay); - void VISenter(void); /* RAID code needs this */ EXPORT_SYMBOL(VISenter); diff --git a/arch/sparc64/kernel/sysfs.c b/arch/sparc64/kernel/sysfs.c index cdb1477af89f..52816c7be0b9 100644 --- a/arch/sparc64/kernel/sysfs.c +++ b/arch/sparc64/kernel/sysfs.c @@ -193,7 +193,6 @@ static ssize_t show_##NAME(struct sys_device *dev, char *buf) \ } SHOW_CPUDATA_ULONG_NAME(clock_tick, clock_tick); -SHOW_CPUDATA_ULONG_NAME(udelay_val, udelay_val); SHOW_CPUDATA_UINT_NAME(l1_dcache_size, dcache_size); SHOW_CPUDATA_UINT_NAME(l1_dcache_line_size, dcache_line_size); SHOW_CPUDATA_UINT_NAME(l1_icache_size, icache_size); @@ -203,7 +202,6 @@ SHOW_CPUDATA_UINT_NAME(l2_cache_line_size, ecache_line_size); static struct sysdev_attribute cpu_core_attrs[] = { _SYSDEV_ATTR(clock_tick, 0444, show_clock_tick, NULL), - _SYSDEV_ATTR(udelay_val, 0444, show_udelay_val, NULL), _SYSDEV_ATTR(l1_dcache_size, 0444, show_l1_dcache_size, NULL), _SYSDEV_ATTR(l1_dcache_line_size, 0444, show_l1_dcache_line_size, NULL), _SYSDEV_ATTR(l1_icache_size, 0444, show_l1_icache_size, NULL), diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c index a31a0439244f..62e316ab1339 100644 --- a/arch/sparc64/kernel/time.c +++ b/arch/sparc64/kernel/time.c @@ -849,9 +849,6 @@ static unsigned long sparc64_init_timers(void) { struct device_node *dp; unsigned long clock; -#ifdef CONFIG_SMP - extern void smp_tick_init(void); -#endif dp = of_find_node_by_path("/"); if (tlb_type == spitfire) { @@ -874,10 +871,6 @@ static unsigned long sparc64_init_timers(void) clock = of_getintprop_default(dp, "stick-frequency", 0); } -#ifdef CONFIG_SMP - smp_tick_init(); -#endif - return clock; } @@ -1038,10 +1031,31 @@ static void __init setup_clockevent_multiplier(unsigned long hz) sparc64_clockevent.mult = mult; } +static unsigned long tb_ticks_per_usec __read_mostly; + +void __delay(unsigned long loops) +{ + unsigned long bclock, now; + + bclock = tick_ops->get_tick(); + do { + now = tick_ops->get_tick(); + } while ((now-bclock) < loops); +} +EXPORT_SYMBOL(__delay); + +void udelay(unsigned long usecs) +{ + __delay(tb_ticks_per_usec * usecs); +} +EXPORT_SYMBOL(udelay); + void __init time_init(void) { unsigned long clock = sparc64_init_timers(); + tb_ticks_per_usec = clock / USEC_PER_SEC; + timer_ticks_per_nsec_quotient = clocksource_hz2mult(clock, SPARC64_NSEC_PER_CYC_SHIFT); diff --git a/arch/sparc64/kernel/vio.c b/arch/sparc64/kernel/vio.c new file mode 100644 index 000000000000..49569b44ea1f --- /dev/null +++ b/arch/sparc64/kernel/vio.c @@ -0,0 +1,395 @@ +/* vio.c: Virtual I/O channel devices probing infrastructure. + * + * Copyright (c) 2003-2005 IBM Corp. + * Dave Engebretsen engebret@us.ibm.com + * Santiago Leon santil@us.ibm.com + * Hollis Blanchard <hollisb@us.ibm.com> + * Stephen Rothwell + * + * Adapted to sparc64 by David S. Miller davem@davemloft.net + */ + +#include <linux/kernel.h> +#include <linux/irq.h> +#include <linux/init.h> + +#include <asm/mdesc.h> +#include <asm/vio.h> + +static inline int find_in_proplist(const char *list, const char *match, + int len) +{ + while (len > 0) { + int l; + + if (!strcmp(list, match)) + return 1; + l = strlen(list) + 1; + list += l; + len -= l; + } + return 0; +} + +static const struct vio_device_id *vio_match_device( + const struct vio_device_id *matches, + const struct vio_dev *dev) +{ + const char *type, *compat; + int len; + + type = dev->type; + compat = dev->compat; + len = dev->compat_len; + + while (matches->type[0] || matches->compat[0]) { + int match = 1; + if (matches->type[0]) + match &= !strcmp(matches->type, type); + + if (matches->compat[0]) { + match &= len && + find_in_proplist(compat, matches->compat, len); + } + if (match) + return matches; + matches++; + } + return NULL; +} + +static int vio_bus_match(struct device *dev, struct device_driver *drv) +{ + struct vio_dev *vio_dev = to_vio_dev(dev); + struct vio_driver *vio_drv = to_vio_driver(drv); + const struct vio_device_id *matches = vio_drv->id_table; + + if (!matches) + return 0; + + return vio_match_device(matches, vio_dev) != NULL; +} + +static int vio_device_probe(struct device *dev) +{ + struct vio_dev *vdev = to_vio_dev(dev); + struct vio_driver *drv = to_vio_driver(dev->driver); + const struct vio_device_id *id; + int error = -ENODEV; + + if (drv->probe) { + id = vio_match_device(drv->id_table, vdev); + if (id) + error = drv->probe(vdev, id); + } + + return error; +} + +static int vio_device_remove(struct device *dev) +{ + struct vio_dev *vdev = to_vio_dev(dev); + struct vio_driver *drv = to_vio_driver(dev->driver); + + if (drv->remove) + return drv->remove(vdev); + + return 1; +} + +static ssize_t devspec_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct vio_dev *vdev = to_vio_dev(dev); + const char *str = "none"; + + if (!strcmp(vdev->type, "network")) + str = "vnet"; + else if (!strcmp(vdev->type, "block")) + str = "vdisk"; + + return sprintf(buf, "%s\n", str); +} + +static ssize_t type_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct vio_dev *vdev = to_vio_dev(dev); + return sprintf(buf, "%s\n", vdev->type); +} + +static struct device_attribute vio_dev_attrs[] = { + __ATTR_RO(devspec), + __ATTR_RO(type), + __ATTR_NULL +}; + +static struct bus_type vio_bus_type = { + .name = "vio", + .dev_attrs = vio_dev_attrs, + .match = vio_bus_match, + .probe = vio_device_probe, + .remove = vio_device_remove, +}; + +int vio_register_driver(struct vio_driver *viodrv) +{ + viodrv->driver.bus = &vio_bus_type; + + return driver_register(&viodrv->driver); +} +EXPORT_SYMBOL(vio_register_driver); + +void vio_unregister_driver(struct vio_driver *viodrv) +{ + driver_unregister(&viodrv->driver); +} +EXPORT_SYMBOL(vio_unregister_driver); + +static void __devinit vio_dev_release(struct device *dev) +{ + kfree(to_vio_dev(dev)); +} + +static ssize_t +show_pciobppath_attr(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct vio_dev *vdev; + struct device_node *dp; + + vdev = to_vio_dev(dev); + dp = vdev->dp; + + return snprintf (buf, PAGE_SIZE, "%s\n", dp->full_name); +} + +static DEVICE_ATTR(obppath, S_IRUSR | S_IRGRP | S_IROTH, + show_pciobppath_attr, NULL); + +struct device_node *cdev_node; + +static struct vio_dev *root_vdev; +static u64 cdev_cfg_handle; + +static void vio_fill_channel_info(struct mdesc_handle *hp, u64 mp, + struct vio_dev *vdev) +{ + u64 a; + + mdesc_for_each_arc(a, hp, mp, MDESC_ARC_TYPE_FWD) { + const u64 *chan_id; + const u64 *irq; + u64 target; + + target = mdesc_arc_target(hp, a); + + irq = mdesc_get_property(hp, target, "tx-ino", NULL); + if (irq) + vdev->tx_irq = sun4v_build_virq(cdev_cfg_handle, *irq); + + irq = mdesc_get_property(hp, target, "rx-ino", NULL); + if (irq) + vdev->rx_irq = sun4v_build_virq(cdev_cfg_handle, *irq); + + chan_id = mdesc_get_property(hp, target, "id", NULL); + if (chan_id) + vdev->channel_id = *chan_id; + } +} + +static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp, + struct device *parent) +{ + const char *type, *compat; + struct device_node *dp; + struct vio_dev *vdev; + int err, tlen, clen; + + type = mdesc_get_property(hp, mp, "device-type", &tlen); + if (!type) { + type = mdesc_get_property(hp, mp, "name", &tlen); + if (!type) { + type = mdesc_node_name(hp, mp); + tlen = strlen(type) + 1; + } + } + if (tlen > VIO_MAX_TYPE_LEN) { + printk(KERN_ERR "VIO: Type string [%s] is too long.\n", + type); + return NULL; + } + + compat = mdesc_get_property(hp, mp, "device-type", &clen); + if (!compat) { + clen = 0; + } else if (clen > VIO_MAX_COMPAT_LEN) { + printk(KERN_ERR "VIO: Compat len %d for [%s] is too long.\n", + clen, type); + return NULL; + } + + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); + if (!vdev) { + printk(KERN_ERR "VIO: Could not allocate vio_dev\n"); + return NULL; + } + + vdev->mp = mp; + memcpy(vdev->type, type, tlen); + if (compat) + memcpy(vdev->compat, compat, clen); + else + memset(vdev->compat, 0, sizeof(vdev->compat)); + vdev->compat_len = clen; + + vdev->channel_id = ~0UL; + vdev->tx_irq = ~0; + vdev->rx_irq = ~0; + + vio_fill_channel_info(hp, mp, vdev); + + snprintf(vdev->dev.bus_id, BUS_ID_SIZE, "%lx", mp); + vdev->dev.parent = parent; + vdev->dev.bus = &vio_bus_type; + vdev->dev.release = vio_dev_release; + + if (parent == NULL) { + dp = cdev_node; + } else if (to_vio_dev(parent) == root_vdev) { + dp = of_get_next_child(cdev_node, NULL); + while (dp) { + if (!strcmp(dp->type, type)) + break; + + dp = of_get_next_child(cdev_node, dp); + } + } else { + dp = to_vio_dev(parent)->dp; + } + vdev->dp = dp; + + err = device_register(&vdev->dev); + if (err) { + printk(KERN_ERR "VIO: Could not register device %s, err=%d\n", + vdev->dev.bus_id, err); + kfree(vdev); + return NULL; + } + if (vdev->dp) + err = sysfs_create_file(&vdev->dev.kobj, + &dev_attr_obppath.attr); + + return vdev; +} + +static void walk_tree(struct mdesc_handle *hp, u64 n, struct vio_dev *parent) +{ + u64 a; + + mdesc_for_each_arc(a, hp, n, MDESC_ARC_TYPE_FWD) { + struct vio_dev *vdev; + u64 target; + + target = mdesc_arc_target(hp, a); + vdev = vio_create_one(hp, target, &parent->dev); + if (vdev) + walk_tree(hp, target, vdev); + } +} + +static void create_devices(struct mdesc_handle *hp, u64 root) +{ + u64 mp; + + root_vdev = vio_create_one(hp, root, NULL); + if (!root_vdev) { + printk(KERN_ERR "VIO: Coult not create root device.\n"); + return; + } + + walk_tree(hp, root, root_vdev); + + /* Domain services is odd as it doesn't sit underneath the + * channel-devices node, so we plug it in manually. + */ + mp = mdesc_node_by_name(hp, MDESC_NODE_NULL, "domain-services"); + if (mp != MDESC_NODE_NULL) { + struct vio_dev *parent = vio_create_one(hp, mp, + &root_vdev->dev); + + if (parent) + walk_tree(hp, mp, parent); + } +} + +const char *channel_devices_node = "channel-devices"; +const char *channel_devices_compat = "SUNW,sun4v-channel-devices"; +const char *cfg_handle_prop = "cfg-handle"; + +static int __init vio_init(void) +{ + struct mdesc_handle *hp; + const char *compat; + const u64 *cfg_handle; + int err, len; + u64 root; + + err = bus_register(&vio_bus_type); + if (err) { + printk(KERN_ERR "VIO: Could not register bus type err=%d\n", + err); + return err; + } + + hp = mdesc_grab(); + if (!hp) + return 0; + + root = mdesc_node_by_name(hp, MDESC_NODE_NULL, channel_devices_node); + if (root == MDESC_NODE_NULL) { + printk(KERN_INFO "VIO: No channel-devices MDESC node.\n"); + mdesc_release(hp); + return 0; + } + + cdev_node = of_find_node_by_name(NULL, "channel-devices"); + err = -ENODEV; + if (!cdev_node) { + printk(KERN_INFO "VIO: No channel-devices OBP node.\n"); + goto out_release; + } + + compat = mdesc_get_property(hp, root, "compatible", &len); + if (!compat) { + printk(KERN_ERR "VIO: Channel devices lacks compatible " + "property\n"); + goto out_release; + } + if (!find_in_proplist(compat, channel_devices_compat, len)) { + printk(KERN_ERR "VIO: Channel devices node lacks (%s) " + "compat entry.\n", channel_devices_compat); + goto out_release; + } + + cfg_handle = mdesc_get_property(hp, root, cfg_handle_prop, NULL); + if (!cfg_handle) { + printk(KERN_ERR "VIO: Channel devices lacks %s property\n", + cfg_handle_prop); + goto out_release; + } + + cdev_cfg_handle = *cfg_handle; + + create_devices(hp, root); + + mdesc_release(hp); + + return 0; + +out_release: + mdesc_release(hp); + return err; +} + +postcore_initcall(vio_init); diff --git a/arch/sparc64/kernel/viohs.c b/arch/sparc64/kernel/viohs.c new file mode 100644 index 000000000000..15613add45d1 --- /dev/null +++ b/arch/sparc64/kernel/viohs.c @@ -0,0 +1,792 @@ +/* viohs.c: LDOM Virtual I/O handshake helper layer. + * + * Copyright (C) 2007 David S. Miller <davem@davemloft.net> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include <asm/ldc.h> +#include <asm/vio.h> + +int vio_ldc_send(struct vio_driver_state *vio, void *data, int len) +{ + int err, limit = 1000; + + err = -EINVAL; + while (limit-- > 0) { + err = ldc_write(vio->lp, data, len); + if (!err || (err != -EAGAIN)) + break; + udelay(1); + } + + return err; +} +EXPORT_SYMBOL(vio_ldc_send); + +static int send_ctrl(struct vio_driver_state *vio, + struct vio_msg_tag *tag, int len) +{ + tag->sid = vio_send_sid(vio); + return vio_ldc_send(vio, tag, len); +} + +static void init_tag(struct vio_msg_tag *tag, u8 type, u8 stype, u16 stype_env) +{ + tag->type = type; + tag->stype = stype; + tag->stype_env = stype_env; +} + +static int send_version(struct vio_driver_state *vio, u16 major, u16 minor) +{ + struct vio_ver_info pkt; + + vio->_local_sid = (u32) sched_clock(); + + memset(&pkt, 0, sizeof(pkt)); + init_tag(&pkt.tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_VER_INFO); + pkt.major = major; + pkt.minor = minor; + pkt.dev_class = vio->dev_class; + + viodbg(HS, "SEND VERSION INFO maj[%u] min[%u] devclass[%u]\n", + major, minor, vio->dev_class); + + return send_ctrl(vio, &pkt.tag, sizeof(pkt)); +} + +static int start_handshake(struct vio_driver_state *vio) +{ + int err; + + viodbg(HS, "START HANDSHAKE\n"); + + vio->hs_state = VIO_HS_INVALID; + + err = send_version(vio, + vio->ver_table[0].major, + vio->ver_table[0].minor); + if (err < 0) + return err; + + return 0; +} + +void vio_link_state_change(struct vio_driver_state *vio, int event) +{ + if (event == LDC_EVENT_UP) { + vio->hs_state = VIO_HS_INVALID; + + switch (vio->dev_class) { + case VDEV_NETWORK: + case VDEV_NETWORK_SWITCH: + vio->dr_state = (VIO_DR_STATE_TXREQ | + VIO_DR_STATE_RXREQ); + break; + + case VDEV_DISK: + vio->dr_state = VIO_DR_STATE_TXREQ; + break; + case VDEV_DISK_SERVER: + vio->dr_state = VIO_DR_STATE_RXREQ; + break; + } + start_handshake(vio); + } +} +EXPORT_SYMBOL(vio_link_state_change); + +static int handshake_failure(struct vio_driver_state *vio) +{ + struct vio_dring_state *dr; + + /* XXX Put policy here... Perhaps start a timer to fire + * XXX in 100 ms, which will bring the link up and retry + * XXX the handshake. + */ + + viodbg(HS, "HANDSHAKE FAILURE\n"); + + vio->dr_state &= ~(VIO_DR_STATE_TXREG | + VIO_DR_STATE_RXREG); + + dr = &vio->drings[VIO_DRIVER_RX_RING]; + memset(dr, 0, sizeof(*dr)); + + kfree(vio->desc_buf); + vio->desc_buf = NULL; + vio->desc_buf_len = 0; + + vio->hs_state = VIO_HS_INVALID; + + return -ECONNRESET; +} + +static int process_unknown(struct vio_driver_state *vio, void *arg) +{ + struct vio_msg_tag *pkt = arg; + + viodbg(HS, "UNKNOWN CONTROL [%02x:%02x:%04x:%08x]\n", + pkt->type, pkt->stype, pkt->stype_env, pkt->sid); + + printk(KERN_ERR "vio: ID[%lu] Resetting connection.\n", + vio->vdev->channel_id); + + ldc_disconnect(vio->lp); + + return -ECONNRESET; +} + +static int send_dreg(struct vio_driver_state *vio) +{ + struct vio_dring_state *dr = &vio->drings[VIO_DRIVER_TX_RING]; + union { + struct vio_dring_register pkt; + char all[sizeof(struct vio_dring_register) + + (sizeof(struct ldc_trans_cookie) * + dr->ncookies)]; + } u; + int i; + + memset(&u, 0, sizeof(u)); + init_tag(&u.pkt.tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_DRING_REG); + u.pkt.dring_ident = 0; + u.pkt.num_descr = dr->num_entries; + u.pkt.descr_size = dr->entry_size; + u.pkt.options = VIO_TX_DRING; + u.pkt.num_cookies = dr->ncookies; + + viodbg(HS, "SEND DRING_REG INFO ndesc[%u] dsz[%u] opt[0x%x] " + "ncookies[%u]\n", + u.pkt.num_descr, u.pkt.descr_size, u.pkt.options, + u.pkt.num_cookies); + + for (i = 0; i < dr->ncookies; i++) { + u.pkt.cookies[i] = dr->cookies[i]; + + viodbg(HS, "DRING COOKIE(%d) [%016llx:%016llx]\n", + i, + (unsigned long long) u.pkt.cookies[i].cookie_addr, + (unsigned long long) u.pkt.cookies[i].cookie_size); + } + + return send_ctrl(vio, &u.pkt.tag, sizeof(u)); +} + +static int send_rdx(struct vio_driver_state *vio) +{ + struct vio_rdx pkt; + + memset(&pkt, 0, sizeof(pkt)); + + init_tag(&pkt.tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX); + + viodbg(HS, "SEND RDX INFO\n"); + + return send_ctrl(vio, &pkt.tag, sizeof(pkt)); +} + +static int send_attr(struct vio_driver_state *vio) +{ + return vio->ops->send_attr(vio); +} + +static struct vio_version *find_by_major(struct vio_driver_state *vio, + u16 major) +{ + struct vio_version *ret = NULL; + int i; + + for (i = 0; i < vio->ver_table_entries; i++) { + struct vio_version *v = &vio->ver_table[i]; + if (v->major <= major) { + ret = v; + break; + } + } + return ret; +} + +static int process_ver_info(struct vio_driver_state *vio, + struct vio_ver_info *pkt) +{ + struct vio_version *vap; + int err; + + viodbg(HS, "GOT VERSION INFO maj[%u] min[%u] devclass[%u]\n", + pkt->major, pkt->minor, pkt->dev_class); + + if (vio->hs_state != VIO_HS_INVALID) { + /* XXX Perhaps invoke start_handshake? XXX */ + memset(&vio->ver, 0, sizeof(vio->ver)); + vio->hs_state = VIO_HS_INVALID; + } + + vap = find_by_major(vio, pkt->major); + + vio->_peer_sid = pkt->tag.sid; + + if (!vap) { + pkt->tag.stype = VIO_SUBTYPE_NACK; + pkt->major = 0; + pkt->minor = 0; + viodbg(HS, "SEND VERSION NACK maj[0] min[0]\n"); + err = send_ctrl(vio, &pkt->tag, sizeof(*pkt)); + } else if (vap->major != pkt->major) { + pkt->tag.stype = VIO_SUBTYPE_NACK; + pkt->major = vap->major; + pkt->minor = vap->minor; + viodbg(HS, "SEND VERSION NACK maj[%u] min[%u]\n", + pkt->major, pkt->minor); + err = send_ctrl(vio, &pkt->tag, sizeof(*pkt)); + } else { + struct vio_version ver = { + .major = pkt->major, + .minor = pkt->minor, + }; + if (ver.minor > vap->minor) + ver.minor = vap->minor; + pkt->minor = ver.minor; + pkt->tag.stype = VIO_SUBTYPE_ACK; + viodbg(HS, "SEND VERSION ACK maj[%u] min[%u]\n", + pkt->major, pkt->minor); + err = send_ctrl(vio, &pkt->tag, sizeof(*pkt)); + if (err > 0) { + vio->ver = ver; + vio->hs_state = VIO_HS_GOTVERS; + } + } + if (err < 0) + return handshake_failure(vio); + + return 0; +} + +static int process_ver_ack(struct vio_driver_state *vio, + struct vio_ver_info *pkt) +{ + viodbg(HS, "GOT VERSION ACK maj[%u] min[%u] devclass[%u]\n", + pkt->major, pkt->minor, pkt->dev_class); + + if (vio->hs_state & VIO_HS_GOTVERS) { + if (vio->ver.major != pkt->major || + vio->ver.minor != pkt->minor) { + pkt->tag.stype = VIO_SUBTYPE_NACK; + (void) send_ctrl(vio, &pkt->tag, sizeof(*pkt)); + return handshake_failure(vio); + } + } else { + vio->ver.major = pkt->major; + vio->ver.minor = pkt->minor; + vio->hs_state = VIO_HS_GOTVERS; + } + + switch (vio->dev_class) { + case VDEV_NETWORK: + case VDEV_DISK: + if (send_attr(vio) < 0) + return handshake_failure(vio); + break; + + default: + break; + } + + return 0; +} + +static int process_ver_nack(struct vio_driver_state *vio, + struct vio_ver_info *pkt) +{ + struct vio_version *nver; + + viodbg(HS, "GOT VERSION NACK maj[%u] min[%u] devclass[%u]\n", + pkt->major, pkt->minor, pkt->dev_class); + + if ((pkt->major == 0 && pkt->minor == 0) || + !(nver = find_by_major(vio, pkt->major))) + return handshake_failure(vio); + + if (send_version(vio, nver->major, nver->minor) < 0) + return handshake_failure(vio); + + return 0; +} + +static int process_ver(struct vio_driver_state *vio, struct vio_ver_info *pkt) +{ + switch (pkt->tag.stype) { + case VIO_SUBTYPE_INFO: + return process_ver_info(vio, pkt); + + case VIO_SUBTYPE_ACK: + return process_ver_ack(vio, pkt); + + case VIO_SUBTYPE_NACK: + return process_ver_nack(vio, pkt); + + default: + return handshake_failure(vio); + }; +} + +static int process_attr(struct vio_driver_state *vio, void *pkt) +{ + int err; + + if (!(vio->hs_state & VIO_HS_GOTVERS)) + return handshake_failure(vio); + + err = vio->ops->handle_attr(vio, pkt); + if (err < 0) { + return handshake_failure(vio); + } else { + vio->hs_state |= VIO_HS_GOT_ATTR; + + if ((vio->dr_state & VIO_DR_STATE_TXREQ) && + !(vio->hs_state & VIO_HS_SENT_DREG)) { + if (send_dreg(vio) < 0) + return handshake_failure(vio); + + vio->hs_state |= VIO_HS_SENT_DREG; + } + } + return 0; +} + +static int all_drings_registered(struct vio_driver_state *vio) +{ + int need_rx, need_tx; + + need_rx = (vio->dr_state & VIO_DR_STATE_RXREQ); + need_tx = (vio->dr_state & VIO_DR_STATE_TXREQ); + + if (need_rx && + !(vio->dr_state & VIO_DR_STATE_RXREG)) + return 0; + + if (need_tx && + !(vio->dr_state & VIO_DR_STATE_TXREG)) + return 0; + + return 1; +} + +static int process_dreg_info(struct vio_driver_state *vio, + struct vio_dring_register *pkt) +{ + struct vio_dring_state *dr; + int i, len; + + viodbg(HS, "GOT DRING_REG INFO ident[%llx] " + "ndesc[%u] dsz[%u] opt[0x%x] ncookies[%u]\n", + (unsigned long long) pkt->dring_ident, + pkt->num_descr, pkt->descr_size, pkt->options, + pkt->num_cookies); + + if (!(vio->dr_state & VIO_DR_STATE_RXREQ)) + goto send_nack; + + if (vio->dr_state & VIO_DR_STATE_RXREG) + goto send_nack; + + vio->desc_buf = kzalloc(pkt->descr_size, GFP_ATOMIC); + if (!vio->desc_buf) + goto send_nack; + + vio->desc_buf_len = pkt->descr_size; + + dr = &vio->drings[VIO_DRIVER_RX_RING]; + + dr->num_entries = pkt->num_descr; + dr->entry_size = pkt->descr_size; + dr->ncookies = pkt->num_cookies; + for (i = 0; i < dr->ncookies; i++) { + dr->cookies[i] = pkt->cookies[i]; + + viodbg(HS, "DRING COOKIE(%d) [%016llx:%016llx]\n", + i, + (unsigned long long) + pkt->cookies[i].cookie_addr, + (unsigned long long) + pkt->cookies[i].cookie_size); + } + + pkt->tag.stype = VIO_SUBTYPE_ACK; + pkt->dring_ident = ++dr->ident; + + viodbg(HS, "SEND DRING_REG ACK ident[%llx]\n", + (unsigned long long) pkt->dring_ident); + + len = (sizeof(*pkt) + + (dr->ncookies * sizeof(struct ldc_trans_cookie))); + if (send_ctrl(vio, &pkt->tag, len) < 0) + goto send_nack; + + vio->dr_state |= VIO_DR_STATE_RXREG; + + return 0; + +send_nack: + pkt->tag.stype = VIO_SUBTYPE_NACK; + viodbg(HS, "SEND DRING_REG NACK\n"); + (void) send_ctrl(vio, &pkt->tag, sizeof(*pkt)); + + return handshake_failure(vio); +} + +static int process_dreg_ack(struct vio_driver_state *vio, + struct vio_dring_register *pkt) +{ + struct vio_dring_state *dr; + + viodbg(HS, "GOT DRING_REG ACK ident[%llx] " + "ndesc[%u] dsz[%u] opt[0x%x] ncookies[%u]\n", + (unsigned long long) pkt->dring_ident, + pkt->num_descr, pkt->descr_size, pkt->options, + pkt->num_cookies); + + dr = &vio->drings[VIO_DRIVER_TX_RING]; + + if (!(vio->dr_state & VIO_DR_STATE_TXREQ)) + return handshake_failure(vio); + + dr->ident = pkt->dring_ident; + vio->dr_state |= VIO_DR_STATE_TXREG; + + if (all_drings_registered(vio)) { + if (send_rdx(vio) < 0) + return handshake_failure(vio); + vio->hs_state = VIO_HS_SENT_RDX; + } + return 0; +} + +static int process_dreg_nack(struct vio_driver_state *vio, + struct vio_dring_register *pkt) +{ + viodbg(HS, "GOT DRING_REG NACK ident[%llx] " + "ndesc[%u] dsz[%u] opt[0x%x] ncookies[%u]\n", + (unsigned long long) pkt->dring_ident, + pkt->num_descr, pkt->descr_size, pkt->options, + pkt->num_cookies); + + return handshake_failure(vio); +} + +static int process_dreg(struct vio_driver_state *vio, + struct vio_dring_register *pkt) +{ + if (!(vio->hs_state & VIO_HS_GOTVERS)) + return handshake_failure(vio); + + switch (pkt->tag.stype) { + case VIO_SUBTYPE_INFO: + return process_dreg_info(vio, pkt); + + case VIO_SUBTYPE_ACK: + return process_dreg_ack(vio, pkt); + + case VIO_SUBTYPE_NACK: + return process_dreg_nack(vio, pkt); + + default: + return handshake_failure(vio); + } +} + +static int process_dunreg(struct vio_driver_state *vio, + struct vio_dring_unregister *pkt) +{ + struct vio_dring_state *dr = &vio->drings[VIO_DRIVER_RX_RING]; + + viodbg(HS, "GOT DRING_UNREG\n"); + + if (pkt->dring_ident != dr->ident) + return 0; + + vio->dr_state &= ~VIO_DR_STATE_RXREG; + + memset(dr, 0, sizeof(*dr)); + + kfree(vio->desc_buf); + vio->desc_buf = NULL; + vio->desc_buf_len = 0; + + return 0; +} + +static int process_rdx_info(struct vio_driver_state *vio, struct vio_rdx *pkt) +{ + viodbg(HS, "GOT RDX INFO\n"); + + pkt->tag.stype = VIO_SUBTYPE_ACK; + viodbg(HS, "SEND RDX ACK\n"); + if (send_ctrl(vio, &pkt->tag, sizeof(*pkt)) < 0) + return handshake_failure(vio); + + vio->hs_state |= VIO_HS_SENT_RDX_ACK; + return 0; +} + +static int process_rdx_ack(struct vio_driver_state *vio, struct vio_rdx *pkt) +{ + viodbg(HS, "GOT RDX ACK\n"); + + if (!(vio->hs_state & VIO_HS_SENT_RDX)) + return handshake_failure(vio); + + vio->hs_state |= VIO_HS_GOT_RDX_ACK; + return 0; +} + +static int process_rdx_nack(struct vio_driver_state *vio, struct vio_rdx *pkt) +{ + viodbg(HS, "GOT RDX NACK\n"); + + return handshake_failure(vio); +} + +static int process_rdx(struct vio_driver_state *vio, struct vio_rdx *pkt) +{ + if (!all_drings_registered(vio)) + handshake_failure(vio); + + switch (pkt->tag.stype) { + case VIO_SUBTYPE_INFO: + return process_rdx_info(vio, pkt); + + case VIO_SUBTYPE_ACK: + return process_rdx_ack(vio, pkt); + + case VIO_SUBTYPE_NACK: + return process_rdx_nack(vio, pkt); + + default: + return handshake_failure(vio); + } +} + +int vio_control_pkt_engine(struct vio_driver_state *vio, void *pkt) +{ + struct vio_msg_tag *tag = pkt; + u8 prev_state = vio->hs_state; + int err; + + switch (tag->stype_env) { + case VIO_VER_INFO: + err = process_ver(vio, pkt); + break; + + case VIO_ATTR_INFO: + err = process_attr(vio, pkt); + break; + + case VIO_DRING_REG: + err = process_dreg(vio, pkt); + break; + + case VIO_DRING_UNREG: + err = process_dunreg(vio, pkt); + break; + + case VIO_RDX: + err = process_rdx(vio, pkt); + break; + + default: + err = process_unknown(vio, pkt); + break; + } + if (!err && + vio->hs_state != prev_state && + (vio->hs_state & VIO_HS_COMPLETE)) + vio->ops->handshake_complete(vio); + + return err; +} +EXPORT_SYMBOL(vio_control_pkt_engine); + +void vio_conn_reset(struct vio_driver_state *vio) +{ +} +EXPORT_SYMBOL(vio_conn_reset); + +/* The issue is that the Solaris virtual disk server just mirrors the + * SID values it gets from the client peer. So we work around that + * here in vio_{validate,send}_sid() so that the drivers don't need + * to be aware of this crap. + */ +int vio_validate_sid(struct vio_driver_state *vio, struct vio_msg_tag *tp) +{ + u32 sid; + + /* Always let VERSION+INFO packets through unchecked, they + * define the new SID. + */ + if (tp->type == VIO_TYPE_CTRL && + tp->stype == VIO_SUBTYPE_INFO && + tp->stype_env == VIO_VER_INFO) + return 0; + + /* Ok, now figure out which SID to use. */ + switch (vio->dev_class) { + case VDEV_NETWORK: + case VDEV_NETWORK_SWITCH: + case VDEV_DISK_SERVER: + default: + sid = vio->_peer_sid; + break; + + case VDEV_DISK: + sid = vio->_local_sid; + break; + } + + if (sid == tp->sid) + return 0; + viodbg(DATA, "BAD SID tag->sid[%08x] peer_sid[%08x] local_sid[%08x]\n", + tp->sid, vio->_peer_sid, vio->_local_sid); + return -EINVAL; +} +EXPORT_SYMBOL(vio_validate_sid); + +u32 vio_send_sid(struct vio_driver_state *vio) +{ + switch (vio->dev_class) { + case VDEV_NETWORK: + case VDEV_NETWORK_SWITCH: + case VDEV_DISK: + default: + return vio->_local_sid; + + case VDEV_DISK_SERVER: + return vio->_peer_sid; + } +} +EXPORT_SYMBOL(vio_send_sid); + +extern int vio_ldc_alloc(struct vio_driver_state *vio, + struct ldc_channel_config *base_cfg, + void *event_arg) +{ + struct ldc_channel_config cfg = *base_cfg; + struct ldc_channel *lp; + + cfg.tx_irq = vio->vdev->tx_irq; + cfg.rx_irq = vio->vdev->rx_irq; + + lp = ldc_alloc(vio->vdev->channel_id, &cfg, event_arg); + if (IS_ERR(lp)) + return PTR_ERR(lp); + + vio->lp = lp; + + return 0; +} +EXPORT_SYMBOL(vio_ldc_alloc); + +void vio_ldc_free(struct vio_driver_state *vio) +{ + ldc_free(vio->lp); + vio->lp = NULL; + + kfree(vio->desc_buf); + vio->desc_buf = NULL; + vio->desc_buf_len = 0; +} +EXPORT_SYMBOL(vio_ldc_free); + +void vio_port_up(struct vio_driver_state *vio) +{ + unsigned long flags; + int err, state; + + spin_lock_irqsave(&vio->lock, flags); + + state = ldc_state(vio->lp); + + err = 0; + if (state == LDC_STATE_INIT) { + err = ldc_bind(vio->lp, vio->name); + if (err) + printk(KERN_WARNING "%s: Port %lu bind failed, " + "err=%d\n", + vio->name, vio->vdev->channel_id, err); + } + + if (!err) { + err = ldc_connect(vio->lp); + if (err) + printk(KERN_WARNING "%s: Port %lu connect failed, " + "err=%d\n", + vio->name, vio->vdev->channel_id, err); + } + if (err) { + unsigned long expires = jiffies + HZ; + + expires = round_jiffies(expires); + mod_timer(&vio->timer, expires); + } + + spin_unlock_irqrestore(&vio->lock, flags); +} +EXPORT_SYMBOL(vio_port_up); + +static void vio_port_timer(unsigned long _arg) +{ + struct vio_driver_state *vio = (struct vio_driver_state *) _arg; + + vio_port_up(vio); +} + +int vio_driver_init(struct vio_driver_state *vio, struct vio_dev *vdev, + u8 dev_class, struct vio_version *ver_table, + int ver_table_size, struct vio_driver_ops *ops, + char *name) +{ + switch (dev_class) { + case VDEV_NETWORK: + case VDEV_NETWORK_SWITCH: + case VDEV_DISK: + case VDEV_DISK_SERVER: + break; + + default: + return -EINVAL; + } + + if (!ops->send_attr || + !ops->handle_attr || + !ops->handshake_complete) + return -EINVAL; + + if (!ver_table || ver_table_size < 0) + return -EINVAL; + + if (!name) + return -EINVAL; + + spin_lock_init(&vio->lock); + + vio->name = name; + + vio->dev_class = dev_class; + vio->vdev = vdev; + + vio->ver_table = ver_table; + vio->ver_table_entries = ver_table_size; + + vio->ops = ops; + + setup_timer(&vio->timer, vio_port_timer, (unsigned long) vio); + + return 0; +} +EXPORT_SYMBOL(vio_driver_init); diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile index 4a725d8985f1..c4a6d6e7d03c 100644 --- a/arch/sparc64/lib/Makefile +++ b/arch/sparc64/lib/Makefile @@ -14,6 +14,6 @@ lib-y := PeeCeeI.o copy_page.o clear_page.o strlen.o strncmp.o \ NGmemcpy.o NGcopy_from_user.o NGcopy_to_user.o NGpatch.o \ NGpage.o NGbzero.o \ copy_in_user.o user_fixup.o memmove.o \ - mcount.o ipcsum.o rwsem.o xor.o delay.o + mcount.o ipcsum.o rwsem.o xor.o obj-y += iomap.o diff --git a/arch/sparc64/lib/delay.c b/arch/sparc64/lib/delay.c deleted file mode 100644 index fb27e54a03ee..000000000000 --- a/arch/sparc64/lib/delay.c +++ /dev/null @@ -1,46 +0,0 @@ -/* delay.c: Delay loops for sparc64 - * - * Copyright (C) 2004, 2006 David S. Miller <davem@davemloft.net> - * - * Based heavily upon x86 variant which is: - * Copyright (C) 1993 Linus Torvalds - * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> - */ - -#include <linux/delay.h> -#include <asm/timer.h> - -void __delay(unsigned long loops) -{ - unsigned long bclock, now; - - bclock = tick_ops->get_tick(); - do { - now = tick_ops->get_tick(); - } while ((now-bclock) < loops); -} - -/* We used to multiply by HZ after shifting down by 32 bits - * but that runs into problems for higher values of HZ and - * slow cpus. - */ -void __const_udelay(unsigned long n) -{ - n *= 4; - - n *= (cpu_data(raw_smp_processor_id()).udelay_val * (HZ/4)); - n >>= 32; - - __delay(n + 1); -} - -void __udelay(unsigned long n) -{ - __const_udelay(n * 0x10c7UL); -} - - -void __ndelay(unsigned long n) -{ - __const_udelay(n * 0x5UL); -} diff --git a/arch/sparc64/prom/misc.c b/arch/sparc64/prom/misc.c index f3e0c14e9eef..33c5b7da31e5 100644 --- a/arch/sparc64/prom/misc.c +++ b/arch/sparc64/prom/misc.c @@ -14,6 +14,7 @@ #include <asm/openprom.h> #include <asm/oplib.h> #include <asm/system.h> +#include <asm/ldc.h> int prom_service_exists(const char *service_name) { @@ -37,6 +38,10 @@ void prom_sun4v_guest_soft_state(void) /* Reset and reboot the machine with the command 'bcommand'. */ void prom_reboot(const char *bcommand) { +#ifdef CONFIG_SUN_LDOMS + if (ldom_domaining_enabled) + ldom_reboot(bcommand); +#endif p1275_cmd("boot", P1275_ARG(0, P1275_ARG_IN_STRING) | P1275_INOUT(1, 0), bcommand); } @@ -91,6 +96,10 @@ void prom_cmdline(void) */ void prom_halt(void) { +#ifdef CONFIG_SUN_LDOMS + if (ldom_domaining_enabled) + ldom_power_off(); +#endif again: p1275_cmd("exit", P1275_INOUT(0, 0)); goto again; /* PROM is out to get me -DaveM */ @@ -98,6 +107,10 @@ again: void prom_halt_power_off(void) { +#ifdef CONFIG_SUN_LDOMS + if (ldom_domaining_enabled) + ldom_power_off(); +#endif p1275_cmd("SUNW,power-off", P1275_INOUT(0, 0)); /* if nothing else helps, we just halt */ diff --git a/arch/sparc64/prom/p1275.c b/arch/sparc64/prom/p1275.c index 2b32c489860c..7fcccc0e19cf 100644 --- a/arch/sparc64/prom/p1275.c +++ b/arch/sparc64/prom/p1275.c @@ -16,6 +16,7 @@ #include <asm/system.h> #include <asm/spitfire.h> #include <asm/pstate.h> +#include <asm/ldc.h> struct { long prom_callback; /* 0x00 */ diff --git a/arch/sparc64/prom/tree.c b/arch/sparc64/prom/tree.c index 500f05e2cfcb..17b7ecfe7ca9 100644 --- a/arch/sparc64/prom/tree.c +++ b/arch/sparc64/prom/tree.c @@ -13,6 +13,7 @@ #include <asm/openprom.h> #include <asm/oplib.h> +#include <asm/ldc.h> /* Return the child of node 'node' or zero if no this node has no * direct descendent. @@ -261,9 +262,17 @@ int prom_node_has_property(int node, const char *prop) int prom_setprop(int node, const char *pname, char *value, int size) { - if(size == 0) return 0; - if((pname == 0) || (value == 0)) return 0; + if (size == 0) + return 0; + if ((pname == 0) || (value == 0)) + return 0; +#ifdef CONFIG_SUN_LDOMS + if (ldom_domaining_enabled) { + ldom_set_var(pname, value); + return 0; + } +#endif return p1275_cmd ("setprop", P1275_ARG(1,P1275_ARG_IN_STRING)| P1275_ARG(2,P1275_ARG_IN_BUF)| P1275_INOUT(4, 1), diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index fd7a53bdcb63..e49162b15578 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -423,6 +423,13 @@ config ATA_OVER_ETH This driver provides Support for ATA over Ethernet block devices like the Coraid EtherDrive (R) Storage Blade. +config SUNVDC + tristate "Sun Virtual Disk Client support" + depends on SUN_LDOMS + help + Support for virtual disk devices as a client under Sun + Logical Domains. + source "drivers/s390/block/Kconfig" endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index e5f98acc5d52..43371c59623e 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o +obj-$(CONFIG_SUNVDC) += sunvdc.o obj-$(CONFIG_BLK_DEV_UMEM) += umem.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c new file mode 100644 index 000000000000..0f5e3caf85d7 --- /dev/null +++ b/drivers/block/sunvdc.c @@ -0,0 +1,972 @@ +/* sunvdc.c: Sun LDOM Virtual Disk Client. + * + * Copyright (C) 2007 David S. Miller <davem@davemloft.net> + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/blkdev.h> +#include <linux/hdreg.h> +#include <linux/genhd.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/list.h> + +#include <asm/vio.h> +#include <asm/ldc.h> + +#define DRV_MODULE_NAME "sunvdc" +#define PFX DRV_MODULE_NAME ": " +#define DRV_MODULE_VERSION "1.0" +#define DRV_MODULE_RELDATE "June 25, 2007" + +static char version[] __devinitdata = + DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; +MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_DESCRIPTION("Sun LDOM virtual disk client driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_MODULE_VERSION); + +#define VDC_TX_RING_SIZE 256 + +#define WAITING_FOR_LINK_UP 0x01 +#define WAITING_FOR_TX_SPACE 0x02 +#define WAITING_FOR_GEN_CMD 0x04 +#define WAITING_FOR_ANY -1 + +struct vdc_req_entry { + struct request *req; +}; + +struct vdc_port { + struct vio_driver_state vio; + + struct vdc *vp; + + struct gendisk *disk; + + struct vdc_completion *cmp; + + u64 req_id; + u64 seq; + struct vdc_req_entry rq_arr[VDC_TX_RING_SIZE]; + + unsigned long ring_cookies; + + u64 max_xfer_size; + u32 vdisk_block_size; + + /* The server fills these in for us in the disk attribute + * ACK packet. + */ + u64 operations; + u32 vdisk_size; + u8 vdisk_type; + u8 dev_no; + + char disk_name[32]; + + struct vio_disk_geom geom; + struct vio_disk_vtoc label; + + struct list_head list; +}; + +static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio) +{ + return container_of(vio, struct vdc_port, vio); +} + +struct vdc { + /* Protects prot_list. */ + spinlock_t lock; + + struct vio_dev *dev; + + struct list_head port_list; +}; + +/* Ordered from largest major to lowest */ +static struct vio_version vdc_versions[] = { + { .major = 1, .minor = 0 }, +}; + +#define VDCBLK_NAME "vdisk" +static int vdc_major; +#define PARTITION_SHIFT 3 + +static inline u32 vdc_tx_dring_avail(struct vio_dring_state *dr) +{ + return vio_dring_avail(dr, VDC_TX_RING_SIZE); +} + +static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct gendisk *disk = bdev->bd_disk; + struct vdc_port *port = disk->private_data; + + geo->heads = (u8) port->geom.num_hd; + geo->sectors = (u8) port->geom.num_sec; + geo->cylinders = port->geom.num_cyl; + + return 0; +} + +static struct block_device_operations vdc_fops = { + .owner = THIS_MODULE, + .getgeo = vdc_getgeo, +}; + +static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for) +{ + if (vio->cmp && + (waiting_for == -1 || + vio->cmp->waiting_for == waiting_for)) { + vio->cmp->err = err; + complete(&vio->cmp->com); + vio->cmp = NULL; + } +} + +static void vdc_handshake_complete(struct vio_driver_state *vio) +{ + vdc_finish(vio, 0, WAITING_FOR_LINK_UP); +} + +static int vdc_handle_unknown(struct vdc_port *port, void *arg) +{ + struct vio_msg_tag *pkt = arg; + + printk(KERN_ERR PFX "Received unknown msg [%02x:%02x:%04x:%08x]\n", + pkt->type, pkt->stype, pkt->stype_env, pkt->sid); + printk(KERN_ERR PFX "Resetting connection.\n"); + + ldc_disconnect(port->vio.lp); + + return -ECONNRESET; +} + +static int vdc_send_attr(struct vio_driver_state *vio) +{ + struct vdc_port *port = to_vdc_port(vio); + struct vio_disk_attr_info pkt; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.tag.type = VIO_TYPE_CTRL; + pkt.tag.stype = VIO_SUBTYPE_INFO; + pkt.tag.stype_env = VIO_ATTR_INFO; + pkt.tag.sid = vio_send_sid(vio); + + pkt.xfer_mode = VIO_DRING_MODE; + pkt.vdisk_block_size = port->vdisk_block_size; + pkt.max_xfer_size = port->max_xfer_size; + + viodbg(HS, "SEND ATTR xfer_mode[0x%x] blksz[%u] max_xfer[%lu]\n", + pkt.xfer_mode, pkt.vdisk_block_size, pkt.max_xfer_size); + + return vio_ldc_send(&port->vio, &pkt, sizeof(pkt)); +} + +static int vdc_handle_attr(struct vio_driver_state *vio, void *arg) +{ + struct vdc_port *port = to_vdc_port(vio); + struct vio_disk_attr_info *pkt = arg; + + viodbg(HS, "GOT ATTR stype[0x%x] ops[%lx] disk_size[%lu] disk_type[%x] " + "xfer_mode[0x%x] blksz[%u] max_xfer[%lu]\n", + pkt->tag.stype, pkt->operations, + pkt->vdisk_size, pkt->vdisk_type, + pkt->xfer_mode, pkt->vdisk_block_size, + pkt->max_xfer_size); + + if (pkt->tag.stype == VIO_SUBTYPE_ACK) { + switch (pkt->vdisk_type) { + case VD_DISK_TYPE_DISK: + case VD_DISK_TYPE_SLICE: + break; + + default: + printk(KERN_ERR PFX "%s: Bogus vdisk_type 0x%x\n", + vio->name, pkt->vdisk_type); + return -ECONNRESET; + } + + if (pkt->vdisk_block_size > port->vdisk_block_size) { + printk(KERN_ERR PFX "%s: BLOCK size increased " + "%u --> %u\n", + vio->name, + port->vdisk_block_size, pkt->vdisk_block_size); + return -ECONNRESET; + } + + port->operations = pkt->operations; + port->vdisk_size = pkt->vdisk_size; + port->vdisk_type = pkt->vdisk_type; + if (pkt->max_xfer_size < port->max_xfer_size) + port->max_xfer_size = pkt->max_xfer_size; + port->vdisk_block_size = pkt->vdisk_block_size; + return 0; + } else { + printk(KERN_ERR PFX "%s: Attribute NACK\n", vio->name); + + return -ECONNRESET; + } +} + +static void vdc_end_special(struct vdc_port *port, struct vio_disk_desc *desc) +{ + int err = desc->status; + + vdc_finish(&port->vio, -err, WAITING_FOR_GEN_CMD); +} + +static void vdc_end_request(struct request *req, int uptodate, int num_sectors) +{ + if (end_that_request_first(req, uptodate, num_sectors)) + return; + add_disk_randomness(req->rq_disk); + end_that_request_last(req, uptodate); +} + +static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr, + unsigned int index) +{ + struct vio_disk_desc *desc = vio_dring_entry(dr, index); + struct vdc_req_entry *rqe = &port->rq_arr[index]; + struct request *req; + + if (unlikely(desc->hdr.state != VIO_DESC_DONE)) + return; + + ldc_unmap(port->vio.lp, desc->cookies, desc->ncookies); + desc->hdr.state = VIO_DESC_FREE; + dr->cons = (index + 1) & (VDC_TX_RING_SIZE - 1); + + req = rqe->req; + if (req == NULL) { + vdc_end_special(port, desc); + return; + } + + rqe->req = NULL; + + vdc_end_request(req, !desc->status, desc->size >> 9); + + if (blk_queue_stopped(port->disk->queue)) + blk_start_queue(port->disk->queue); +} + +static int vdc_ack(struct vdc_port *port, void *msgbuf) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + struct vio_dring_data *pkt = msgbuf; + + if (unlikely(pkt->dring_ident != dr->ident || + pkt->start_idx != pkt->end_idx || + pkt->start_idx >= VDC_TX_RING_SIZE)) + return 0; + + vdc_end_one(port, dr, pkt->start_idx); + + return 0; +} + +static int vdc_nack(struct vdc_port *port, void *msgbuf) +{ + /* XXX Implement me XXX */ + return 0; +} + +static void vdc_event(void *arg, int event) +{ + struct vdc_port *port = arg; + struct vio_driver_state *vio = &port->vio; + unsigned long flags; + int err; + + spin_lock_irqsave(&vio->lock, flags); + + if (unlikely(event == LDC_EVENT_RESET || + event == LDC_EVENT_UP)) { + vio_link_state_change(vio, event); + spin_unlock_irqrestore(&vio->lock, flags); + return; + } + + if (unlikely(event != LDC_EVENT_DATA_READY)) { + printk(KERN_WARNING PFX "Unexpected LDC event %d\n", event); + spin_unlock_irqrestore(&vio->lock, flags); + return; + } + + err = 0; + while (1) { + union { + struct vio_msg_tag tag; + u64 raw[8]; + } msgbuf; + + err = ldc_read(vio->lp, &msgbuf, sizeof(msgbuf)); + if (unlikely(err < 0)) { + if (err == -ECONNRESET) + vio_conn_reset(vio); + break; + } + if (err == 0) + break; + viodbg(DATA, "TAG [%02x:%02x:%04x:%08x]\n", + msgbuf.tag.type, + msgbuf.tag.stype, + msgbuf.tag.stype_env, + msgbuf.tag.sid); + err = vio_validate_sid(vio, &msgbuf.tag); + if (err < 0) + break; + + if (likely(msgbuf.tag.type == VIO_TYPE_DATA)) { + if (msgbuf.tag.stype == VIO_SUBTYPE_ACK) + err = vdc_ack(port, &msgbuf); + else if (msgbuf.tag.stype == VIO_SUBTYPE_NACK) + err = vdc_nack(port, &msgbuf); + else + err = vdc_handle_unknown(port, &msgbuf); + } else if (msgbuf.tag.type == VIO_TYPE_CTRL) { + err = vio_control_pkt_engine(vio, &msgbuf); + } else { + err = vdc_handle_unknown(port, &msgbuf); + } + if (err < 0) + break; + } + if (err < 0) + vdc_finish(&port->vio, err, WAITING_FOR_ANY); + spin_unlock_irqrestore(&vio->lock, flags); +} + +static int __vdc_tx_trigger(struct vdc_port *port) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + struct vio_dring_data hdr = { + .tag = { + .type = VIO_TYPE_DATA, + .stype = VIO_SUBTYPE_INFO, + .stype_env = VIO_DRING_DATA, + .sid = vio_send_sid(&port->vio), + }, + .dring_ident = dr->ident, + .start_idx = dr->prod, + .end_idx = dr->prod, + }; + int err, delay; + + hdr.seq = dr->snd_nxt; + delay = 1; + do { + err = vio_ldc_send(&port->vio, &hdr, sizeof(hdr)); + if (err > 0) { + dr->snd_nxt++; + break; + } + udelay(delay); + if ((delay <<= 1) > 128) + delay = 128; + } while (err == -EAGAIN); + + return err; +} + +static int __send_request(struct request *req) +{ + struct vdc_port *port = req->rq_disk->private_data; + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + struct scatterlist sg[port->ring_cookies]; + struct vdc_req_entry *rqe; + struct vio_disk_desc *desc; + unsigned int map_perm; + int nsg, err, i; + u64 len; + u8 op; + + map_perm = LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_IO; + + if (rq_data_dir(req) == READ) { + map_perm |= LDC_MAP_W; + op = VD_OP_BREAD; + } else { + map_perm |= LDC_MAP_R; + op = VD_OP_BWRITE; + } + + nsg = blk_rq_map_sg(req->q, req, sg); + + len = 0; + for (i = 0; i < nsg; i++) + len += sg[i].length; + + if (unlikely(vdc_tx_dring_avail(dr) < 1)) { + blk_stop_queue(port->disk->queue); + err = -ENOMEM; + goto out; + } + + desc = vio_dring_cur(dr); + + err = ldc_map_sg(port->vio.lp, sg, nsg, + desc->cookies, port->ring_cookies, + map_perm); + if (err < 0) { + printk(KERN_ERR PFX "ldc_map_sg() failure, err=%d.\n", err); + return err; + } + + rqe = &port->rq_arr[dr->prod]; + rqe->req = req; + + desc->hdr.ack = VIO_ACK_ENABLE; + desc->req_id = port->req_id; + desc->operation = op; + if (port->vdisk_type == VD_DISK_TYPE_DISK) { + desc->slice = 2; + } else { + desc->slice = 0; + } + desc->status = ~0; + desc->offset = (req->sector << 9) / port->vdisk_block_size; + desc->size = len; + desc->ncookies = err; + + /* This has to be a non-SMP write barrier because we are writing + * to memory which is shared with the peer LDOM. + */ + wmb(); + desc->hdr.state = VIO_DESC_READY; + + err = __vdc_tx_trigger(port); + if (err < 0) { + printk(KERN_ERR PFX "vdc_tx_trigger() failure, err=%d\n", err); + } else { + port->req_id++; + dr->prod = (dr->prod + 1) & (VDC_TX_RING_SIZE - 1); + } +out: + + return err; +} + +static void do_vdc_request(request_queue_t *q) +{ + while (1) { + struct request *req = elv_next_request(q); + + if (!req) + break; + + blkdev_dequeue_request(req); + if (__send_request(req) < 0) + vdc_end_request(req, 0, req->hard_nr_sectors); + } +} + +static int generic_request(struct vdc_port *port, u8 op, void *buf, int len) +{ + struct vio_dring_state *dr; + struct vio_completion comp; + struct vio_disk_desc *desc; + unsigned int map_perm; + unsigned long flags; + int op_len, err; + void *req_buf; + + if (!(((u64)1 << ((u64)op - 1)) & port->operations)) + return -EOPNOTSUPP; + + switch (op) { + case VD_OP_BREAD: + case VD_OP_BWRITE: + default: + return -EINVAL; + + case VD_OP_FLUSH: + op_len = 0; + map_perm = 0; + break; + + case VD_OP_GET_WCE: + op_len = sizeof(u32); + map_perm = LDC_MAP_W; + break; + + case VD_OP_SET_WCE: + op_len = sizeof(u32); + map_perm = LDC_MAP_R; + break; + + case VD_OP_GET_VTOC: + op_len = sizeof(struct vio_disk_vtoc); + map_perm = LDC_MAP_W; + break; + + case VD_OP_SET_VTOC: + op_len = sizeof(struct vio_disk_vtoc); + map_perm = LDC_MAP_R; + break; + + case VD_OP_GET_DISKGEOM: + op_len = sizeof(struct vio_disk_geom); + map_perm = LDC_MAP_W; + break; + + case VD_OP_SET_DISKGEOM: + op_len = sizeof(struct vio_disk_geom); + map_perm = LDC_MAP_R; + break; + + case VD_OP_SCSICMD: + op_len = 16; + map_perm = LDC_MAP_RW; + break; + + case VD_OP_GET_DEVID: + op_len = sizeof(struct vio_disk_devid); + map_perm = LDC_MAP_W; + break; + + case VD_OP_GET_EFI: + case VD_OP_SET_EFI: + return -EOPNOTSUPP; + break; + }; + + map_perm |= LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_IO; + + op_len = (op_len + 7) & ~7; + req_buf = kzalloc(op_len, GFP_KERNEL); + if (!req_buf) + return -ENOMEM; + + if (len > op_len) + len = op_len; + + if (map_perm & LDC_MAP_R) + memcpy(req_buf, buf, len); + + spin_lock_irqsave(&port->vio.lock, flags); + + dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + + /* XXX If we want to use this code generically we have to + * XXX handle TX ring exhaustion etc. + */ + desc = vio_dring_cur(dr); + + err = ldc_map_single(port->vio.lp, req_buf, op_len, + desc->cookies, port->ring_cookies, + map_perm); + if (err < 0) { + spin_unlock_irqrestore(&port->vio.lock, flags); + kfree(req_buf); + return err; + } + + init_completion(&comp.com); + comp.waiting_for = WAITING_FOR_GEN_CMD; + port->vio.cmp = ∁ + + desc->hdr.ack = VIO_ACK_ENABLE; + desc->req_id = port->req_id; + desc->operation = op; + desc->slice = 0; + desc->status = ~0; + desc->offset = 0; + desc->size = op_len; + desc->ncookies = err; + + /* This has to be a non-SMP write barrier because we are writing + * to memory which is shared with the peer LDOM. + */ + wmb(); + desc->hdr.state = VIO_DESC_READY; + + err = __vdc_tx_trigger(port); + if (err >= 0) { + port->req_id++; + dr->prod = (dr->prod + 1) & (VDC_TX_RING_SIZE - 1); + spin_unlock_irqrestore(&port->vio.lock, flags); + + wait_for_completion(&comp.com); + err = comp.err; + } else { + port->vio.cmp = NULL; + spin_unlock_irqrestore(&port->vio.lock, flags); + } + + if (map_perm & LDC_MAP_W) + memcpy(buf, req_buf, len); + + kfree(req_buf); + + return err; +} + +static int __devinit vdc_alloc_tx_ring(struct vdc_port *port) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + unsigned long len, entry_size; + int ncookies; + void *dring; + + entry_size = sizeof(struct vio_disk_desc) + + (sizeof(struct ldc_trans_cookie) * port->ring_cookies); + len = (VDC_TX_RING_SIZE * entry_size); + + ncookies = VIO_MAX_RING_COOKIES; + dring = ldc_alloc_exp_dring(port->vio.lp, len, + dr->cookies, &ncookies, + (LDC_MAP_SHADOW | + LDC_MAP_DIRECT | + LDC_MAP_RW)); + if (IS_ERR(dring)) + return PTR_ERR(dring); + + dr->base = dring; + dr->entry_size = entry_size; + dr->num_entries = VDC_TX_RING_SIZE; + dr->prod = dr->cons = 0; + dr->pending = VDC_TX_RING_SIZE; + dr->ncookies = ncookies; + + return 0; +} + +static void vdc_free_tx_ring(struct vdc_port *port) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + + if (dr->base) { + ldc_free_exp_dring(port->vio.lp, dr->base, + (dr->entry_size * dr->num_entries), + dr->cookies, dr->ncookies); + dr->base = NULL; + dr->entry_size = 0; + dr->num_entries = 0; + dr->pending = 0; + dr->ncookies = 0; + } +} + +static int probe_disk(struct vdc_port *port) +{ + struct vio_completion comp; + struct request_queue *q; + struct gendisk *g; + int err; + + init_completion(&comp.com); + comp.err = 0; + comp.waiting_for = WAITING_FOR_LINK_UP; + port->vio.cmp = ∁ + + vio_port_up(&port->vio); + + wait_for_completion(&comp.com); + if (comp.err) + return comp.err; + + err = generic_request(port, VD_OP_GET_VTOC, + &port->label, sizeof(port->label)); + if (err < 0) { + printk(KERN_ERR PFX "VD_OP_GET_VTOC returns error %d\n", err); + return err; + } + + err = generic_request(port, VD_OP_GET_DISKGEOM, + &port->geom, sizeof(port->geom)); + if (err < 0) { + printk(KERN_ERR PFX "VD_OP_GET_DISKGEOM returns " + "error %d\n", err); + return err; + } + + port->vdisk_size = ((u64)port->geom.num_cyl * + (u64)port->geom.num_hd * + (u64)port->geom.num_sec); + + q = blk_init_queue(do_vdc_request, &port->vio.lock); + if (!q) { + printk(KERN_ERR PFX "%s: Could not allocate queue.\n", + port->vio.name); + return -ENOMEM; + } + g = alloc_disk(1 << PARTITION_SHIFT); + if (!g) { + printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n", + port->vio.name); + blk_cleanup_queue(q); + return -ENOMEM; + } + + port->disk = g; + + blk_queue_max_hw_segments(q, port->ring_cookies); + blk_queue_max_phys_segments(q, port->ring_cookies); + blk_queue_max_sectors(q, port->max_xfer_size); + g->major = vdc_major; + g->first_minor = port->dev_no << PARTITION_SHIFT; + strcpy(g->disk_name, port->disk_name); + + g->fops = &vdc_fops; + g->queue = q; + g->private_data = port; + g->driverfs_dev = &port->vio.vdev->dev; + + set_capacity(g, port->vdisk_size); + + printk(KERN_INFO PFX "%s: %u sectors (%u MB)\n", + g->disk_name, + port->vdisk_size, (port->vdisk_size >> (20 - 9))); + + add_disk(g); + + return 0; +} + +static struct ldc_channel_config vdc_ldc_cfg = { + .event = vdc_event, + .mtu = 64, + .mode = LDC_MODE_UNRELIABLE, +}; + +static struct vio_driver_ops vdc_vio_ops = { + .send_attr = vdc_send_attr, + .handle_attr = vdc_handle_attr, + .handshake_complete = vdc_handshake_complete, +}; + +static int __devinit vdc_port_probe(struct vio_dev *vdev, + const struct vio_device_id *id) +{ + struct mdesc_handle *hp; + struct vdc_port *port; + unsigned long flags; + struct vdc *vp; + const u64 *port_id; + int err; + + vp = dev_get_drvdata(vdev->dev.parent); + if (!vp) { + printk(KERN_ERR PFX "Cannot find port parent vdc.\n"); + return -ENODEV; + } + + hp = mdesc_grab(); + + port_id = mdesc_get_property(hp, vdev->mp, "id", NULL); + err = -ENODEV; + if (!port_id) { + printk(KERN_ERR PFX "Port lacks id property.\n"); + goto err_out_release_mdesc; + } + if ((*port_id << PARTITION_SHIFT) & ~(u64)MINORMASK) { + printk(KERN_ERR PFX "Port id [%lu] too large.\n", *port_id); + goto err_out_release_mdesc; + } + + port = kzalloc(sizeof(*port), GFP_KERNEL); + err = -ENOMEM; + if (!port) { + printk(KERN_ERR PFX "Cannot allocate vdc_port.\n"); + goto err_out_release_mdesc; + } + + port->vp = vp; + port->dev_no = *port_id; + + if (port->dev_no >= 26) + snprintf(port->disk_name, sizeof(port->disk_name), + VDCBLK_NAME "%c%c", + 'a' + (port->dev_no / 26) - 1, + 'a' + (port->dev_no % 26)); + else + snprintf(port->disk_name, sizeof(port->disk_name), + VDCBLK_NAME "%c", 'a' + (port->dev_no % 26)); + + err = vio_driver_init(&port->vio, vdev, VDEV_DISK, + vdc_versions, ARRAY_SIZE(vdc_versions), + &vdc_vio_ops, port->disk_name); + if (err) + goto err_out_free_port; + + port->vdisk_block_size = 512; + port->max_xfer_size = ((128 * 1024) / port->vdisk_block_size); + port->ring_cookies = ((port->max_xfer_size * + port->vdisk_block_size) / PAGE_SIZE) + 2; + + err = vio_ldc_alloc(&port->vio, &vdc_ldc_cfg, port); + if (err) + goto err_out_free_port; + + err = vdc_alloc_tx_ring(port); + if (err) + goto err_out_free_ldc; + + err = probe_disk(port); + if (err) + goto err_out_free_tx_ring; + + INIT_LIST_HEAD(&port->list); + + spin_lock_irqsave(&vp->lock, flags); + list_add(&port->list, &vp->port_list); + spin_unlock_irqrestore(&vp->lock, flags); + + dev_set_drvdata(&vdev->dev, port); + + mdesc_release(hp); + + return 0; + +err_out_free_tx_ring: + vdc_free_tx_ring(port); + +err_out_free_ldc: + vio_ldc_free(&port->vio); + +err_out_free_port: + kfree(port); + +err_out_release_mdesc: + mdesc_release(hp); + return err; +} + +static int vdc_port_remove(struct vio_dev *vdev) +{ + struct vdc_port *port = dev_get_drvdata(&vdev->dev); + + if (port) { + del_timer_sync(&port->vio.timer); + + vdc_free_tx_ring(port); + vio_ldc_free(&port->vio); + + dev_set_drvdata(&vdev->dev, NULL); + + kfree(port); + } + return 0; +} + +static struct vio_device_id vdc_port_match[] = { + { + .type = "vdc-port", + }, + {}, +}; +MODULE_DEVICE_TABLE(vio, vdc_match); + +static struct vio_driver vdc_port_driver = { + .id_table = vdc_port_match, + .probe = vdc_port_probe, + .remove = vdc_port_remove, + .driver = { + .name = "vdc_port", + .owner = THIS_MODULE, + } +}; + +static int __devinit vdc_probe(struct vio_dev *vdev, + const struct vio_device_id *id) +{ + static int vdc_version_printed; + struct vdc *vp; + + if (vdc_version_printed++ == 0) + printk(KERN_INFO "%s", version); + + vp = kzalloc(sizeof(struct vdc), GFP_KERNEL); + if (!vp) + return -ENOMEM; + + spin_lock_init(&vp->lock); + vp->dev = vdev; + INIT_LIST_HEAD(&vp->port_list); + + dev_set_drvdata(&vdev->dev, vp); + + return 0; +} + +static int vdc_remove(struct vio_dev *vdev) +{ + + struct vdc *vp = dev_get_drvdata(&vdev->dev); + + if (vp) { + kfree(vp); + dev_set_drvdata(&vdev->dev, NULL); + } + return 0; +} + +static struct vio_device_id vdc_match[] = { + { + .type = "block", + }, + {}, +}; +MODULE_DEVICE_TABLE(vio, vdc_match); + +static struct vio_driver vdc_driver = { + .id_table = vdc_match, + .probe = vdc_probe, + .remove = vdc_remove, + .driver = { + .name = "vdc", + .owner = THIS_MODULE, + } +}; + +static int __init vdc_init(void) +{ + int err; + + err = register_blkdev(0, VDCBLK_NAME); + if (err < 0) + goto out_err; + + vdc_major = err; + err = vio_register_driver(&vdc_driver); + if (err) + goto out_unregister_blkdev; + + err = vio_register_driver(&vdc_port_driver); + if (err) + goto out_unregister_vdc; + + return 0; + +out_unregister_vdc: + vio_unregister_driver(&vdc_driver); + +out_unregister_blkdev: + unregister_blkdev(vdc_major, VDCBLK_NAME); + vdc_major = 0; + +out_err: + return err; +} + +static void __exit vdc_exit(void) +{ + vio_unregister_driver(&vdc_port_driver); + vio_unregister_driver(&vdc_driver); + unregister_blkdev(vdc_major, VDCBLK_NAME); +} + +module_init(vdc_init); +module_exit(vdc_exit); diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index d17d64eb7065..7903f9c7839e 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -604,6 +604,12 @@ config CASSINI Support for the Sun Cassini chip, aka Sun GigaSwift Ethernet. See also <http://www.sun.com/products-n-solutions/hardware/docs/pdf/817-4341-10.pdf> +config SUNVNET + tristate "Sun Virtual Network support" + depends on SUN_LDOMS + help + Support for virtual network devices under Sun Logical Domains. + config NET_VENDOR_3COM bool "3COM cards" depends on ISA || EISA || MCA || PCI diff --git a/drivers/net/Makefile b/drivers/net/Makefile index c26b8674213c..b95b1b237a26 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_SUNBMAC) += sunbmac.o obj-$(CONFIG_MYRI_SBUS) += myri_sbus.o obj-$(CONFIG_SUNGEM) += sungem.o sungem_phy.o obj-$(CONFIG_CASSINI) += cassini.o +obj-$(CONFIG_SUNVNET) += sunvnet.o obj-$(CONFIG_MACE) += mace.o obj-$(CONFIG_BMAC) += bmac.o diff --git a/drivers/net/sunvnet.c b/drivers/net/sunvnet.c new file mode 100644 index 000000000000..8a667c13faef --- /dev/null +++ b/drivers/net/sunvnet.c @@ -0,0 +1,1164 @@ +/* sunvnet.c: Sun LDOM Virtual Network Driver. + * + * Copyright (C) 2007 David S. Miller <davem@davemloft.net> + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/netdevice.h> +#include <linux/ethtool.h> +#include <linux/etherdevice.h> + +#include <asm/vio.h> +#include <asm/ldc.h> + +#include "sunvnet.h" + +#define DRV_MODULE_NAME "sunvnet" +#define PFX DRV_MODULE_NAME ": " +#define DRV_MODULE_VERSION "1.0" +#define DRV_MODULE_RELDATE "June 25, 2007" + +static char version[] __devinitdata = + DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; +MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_DESCRIPTION("Sun LDOM virtual network driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_MODULE_VERSION); + +/* Ordered from largest major to lowest */ +static struct vio_version vnet_versions[] = { + { .major = 1, .minor = 0 }, +}; + +static inline u32 vnet_tx_dring_avail(struct vio_dring_state *dr) +{ + return vio_dring_avail(dr, VNET_TX_RING_SIZE); +} + +static int vnet_handle_unknown(struct vnet_port *port, void *arg) +{ + struct vio_msg_tag *pkt = arg; + + printk(KERN_ERR PFX "Received unknown msg [%02x:%02x:%04x:%08x]\n", + pkt->type, pkt->stype, pkt->stype_env, pkt->sid); + printk(KERN_ERR PFX "Resetting connection.\n"); + + ldc_disconnect(port->vio.lp); + + return -ECONNRESET; +} + +static int vnet_send_attr(struct vio_driver_state *vio) +{ + struct vnet_port *port = to_vnet_port(vio); + struct net_device *dev = port->vp->dev; + struct vio_net_attr_info pkt; + int i; + + memset(&pkt, 0, sizeof(pkt)); + pkt.tag.type = VIO_TYPE_CTRL; + pkt.tag.stype = VIO_SUBTYPE_INFO; + pkt.tag.stype_env = VIO_ATTR_INFO; + pkt.tag.sid = vio_send_sid(vio); + pkt.xfer_mode = VIO_DRING_MODE; + pkt.addr_type = VNET_ADDR_ETHERMAC; + pkt.ack_freq = 0; + for (i = 0; i < 6; i++) + pkt.addr |= (u64)dev->dev_addr[i] << ((5 - i) * 8); + pkt.mtu = ETH_FRAME_LEN; + + viodbg(HS, "SEND NET ATTR xmode[0x%x] atype[0x%x] addr[%llx] " + "ackfreq[%u] mtu[%llu]\n", + pkt.xfer_mode, pkt.addr_type, + (unsigned long long) pkt.addr, + pkt.ack_freq, + (unsigned long long) pkt.mtu); + + return vio_ldc_send(vio, &pkt, sizeof(pkt)); +} + +static int handle_attr_info(struct vio_driver_state *vio, + struct vio_net_attr_info *pkt) +{ + viodbg(HS, "GOT NET ATTR INFO xmode[0x%x] atype[0x%x] addr[%llx] " + "ackfreq[%u] mtu[%llu]\n", + pkt->xfer_mode, pkt->addr_type, + (unsigned long long) pkt->addr, + pkt->ack_freq, + (unsigned long long) pkt->mtu); + + pkt->tag.sid = vio_send_sid(vio); + + if (pkt->xfer_mode != VIO_DRING_MODE || + pkt->addr_type != VNET_ADDR_ETHERMAC || + pkt->mtu != ETH_FRAME_LEN) { + viodbg(HS, "SEND NET ATTR NACK\n"); + + pkt->tag.stype = VIO_SUBTYPE_NACK; + + (void) vio_ldc_send(vio, pkt, sizeof(*pkt)); + + return -ECONNRESET; + } else { + viodbg(HS, "SEND NET ATTR ACK\n"); + + pkt->tag.stype = VIO_SUBTYPE_ACK; + + return vio_ldc_send(vio, pkt, sizeof(*pkt)); + } + +} + +static int handle_attr_ack(struct vio_driver_state *vio, + struct vio_net_attr_info *pkt) +{ + viodbg(HS, "GOT NET ATTR ACK\n"); + + return 0; +} + +static int handle_attr_nack(struct vio_driver_state *vio, + struct vio_net_attr_info *pkt) +{ + viodbg(HS, "GOT NET ATTR NACK\n"); + + return -ECONNRESET; +} + +static int vnet_handle_attr(struct vio_driver_state *vio, void *arg) +{ + struct vio_net_attr_info *pkt = arg; + + switch (pkt->tag.stype) { + case VIO_SUBTYPE_INFO: + return handle_attr_info(vio, pkt); + + case VIO_SUBTYPE_ACK: + return handle_attr_ack(vio, pkt); + + case VIO_SUBTYPE_NACK: + return handle_attr_nack(vio, pkt); + + default: + return -ECONNRESET; + } +} + +static void vnet_handshake_complete(struct vio_driver_state *vio) +{ + struct vio_dring_state *dr; + + dr = &vio->drings[VIO_DRIVER_RX_RING]; + dr->snd_nxt = dr->rcv_nxt = 1; + + dr = &vio->drings[VIO_DRIVER_TX_RING]; + dr->snd_nxt = dr->rcv_nxt = 1; +} + +/* The hypervisor interface that implements copying to/from imported + * memory from another domain requires that copies are done to 8-byte + * aligned buffers, and that the lengths of such copies are also 8-byte + * multiples. + * + * So we align skb->data to an 8-byte multiple and pad-out the data + * area so we can round the copy length up to the next multiple of + * 8 for the copy. + * + * The transmitter puts the actual start of the packet 6 bytes into + * the buffer it sends over, so that the IP headers after the ethernet + * header are aligned properly. These 6 bytes are not in the descriptor + * length, they are simply implied. This offset is represented using + * the VNET_PACKET_SKIP macro. + */ +static struct sk_buff *alloc_and_align_skb(struct net_device *dev, + unsigned int len) +{ + struct sk_buff *skb = netdev_alloc_skb(dev, len+VNET_PACKET_SKIP+8+8); + unsigned long addr, off; + + if (unlikely(!skb)) + return NULL; + + addr = (unsigned long) skb->data; + off = ((addr + 7UL) & ~7UL) - addr; + if (off) + skb_reserve(skb, off); + + return skb; +} + +static int vnet_rx_one(struct vnet_port *port, unsigned int len, + struct ldc_trans_cookie *cookies, int ncookies) +{ + struct net_device *dev = port->vp->dev; + unsigned int copy_len; + struct sk_buff *skb; + int err; + + err = -EMSGSIZE; + if (unlikely(len < ETH_ZLEN || len > ETH_FRAME_LEN)) { + dev->stats.rx_length_errors++; + goto out_dropped; + } + + skb = alloc_and_align_skb(dev, len); + err = -ENOMEM; + if (unlikely(!skb)) { + dev->stats.rx_missed_errors++; + goto out_dropped; + } + + copy_len = (len + VNET_PACKET_SKIP + 7U) & ~7U; + skb_put(skb, copy_len); + err = ldc_copy(port->vio.lp, LDC_COPY_IN, + skb->data, copy_len, 0, + cookies, ncookies); + if (unlikely(err < 0)) { + dev->stats.rx_frame_errors++; + goto out_free_skb; + } + + skb_pull(skb, VNET_PACKET_SKIP); + skb_trim(skb, len); + skb->protocol = eth_type_trans(skb, dev); + + dev->stats.rx_packets++; + dev->stats.rx_bytes += len; + + netif_rx(skb); + + return 0; + +out_free_skb: + kfree_skb(skb); + +out_dropped: + dev->stats.rx_dropped++; + return err; +} + +static int vnet_send_ack(struct vnet_port *port, struct vio_dring_state *dr, + u32 start, u32 end, u8 vio_dring_state) +{ + struct vio_dring_data hdr = { + .tag = { + .type = VIO_TYPE_DATA, + .stype = VIO_SUBTYPE_ACK, + .stype_env = VIO_DRING_DATA, + .sid = vio_send_sid(&port->vio), + }, + .dring_ident = dr->ident, + .start_idx = start, + .end_idx = end, + .state = vio_dring_state, + }; + int err, delay; + + hdr.seq = dr->snd_nxt; + delay = 1; + do { + err = vio_ldc_send(&port->vio, &hdr, sizeof(hdr)); + if (err > 0) { + dr->snd_nxt++; + break; + } + udelay(delay); + if ((delay <<= 1) > 128) + delay = 128; + } while (err == -EAGAIN); + + return err; +} + +static u32 next_idx(u32 idx, struct vio_dring_state *dr) +{ + if (++idx == dr->num_entries) + idx = 0; + return idx; +} + +static u32 prev_idx(u32 idx, struct vio_dring_state *dr) +{ + if (idx == 0) + idx = dr->num_entries - 1; + else + idx--; + + return idx; +} + +static struct vio_net_desc *get_rx_desc(struct vnet_port *port, + struct vio_dring_state *dr, + u32 index) +{ + struct vio_net_desc *desc = port->vio.desc_buf; + int err; + + err = ldc_get_dring_entry(port->vio.lp, desc, dr->entry_size, + (index * dr->entry_size), + dr->cookies, dr->ncookies); + if (err < 0) + return ERR_PTR(err); + + return desc; +} + +static int put_rx_desc(struct vnet_port *port, + struct vio_dring_state *dr, + struct vio_net_desc *desc, + u32 index) +{ + int err; + + err = ldc_put_dring_entry(port->vio.lp, desc, dr->entry_size, + (index * dr->entry_size), + dr->cookies, dr->ncookies); + if (err < 0) + return err; + + return 0; +} + +static int vnet_walk_rx_one(struct vnet_port *port, + struct vio_dring_state *dr, + u32 index, int *needs_ack) +{ + struct vio_net_desc *desc = get_rx_desc(port, dr, index); + struct vio_driver_state *vio = &port->vio; + int err; + + if (IS_ERR(desc)) + return PTR_ERR(desc); + + viodbg(DATA, "vio_walk_rx_one desc[%02x:%02x:%08x:%08x:%lx:%lx]\n", + desc->hdr.state, desc->hdr.ack, + desc->size, desc->ncookies, + desc->cookies[0].cookie_addr, + desc->cookies[0].cookie_size); + + if (desc->hdr.state != VIO_DESC_READY) + return 1; + err = vnet_rx_one(port, desc->size, desc->cookies, desc->ncookies); + if (err == -ECONNRESET) + return err; + desc->hdr.state = VIO_DESC_DONE; + err = put_rx_desc(port, dr, desc, index); + if (err < 0) + return err; + *needs_ack = desc->hdr.ack; + return 0; +} + +static int vnet_walk_rx(struct vnet_port *port, struct vio_dring_state *dr, + u32 start, u32 end) +{ + struct vio_driver_state *vio = &port->vio; + int ack_start = -1, ack_end = -1; + + end = (end == (u32) -1) ? prev_idx(start, dr) : next_idx(end, dr); + + viodbg(DATA, "vnet_walk_rx start[%08x] end[%08x]\n", start, end); + + while (start != end) { + int ack = 0, err = vnet_walk_rx_one(port, dr, start, &ack); + if (err == -ECONNRESET) + return err; + if (err != 0) + break; + if (ack_start == -1) + ack_start = start; + ack_end = start; + start = next_idx(start, dr); + if (ack && start != end) { + err = vnet_send_ack(port, dr, ack_start, ack_end, + VIO_DRING_ACTIVE); + if (err == -ECONNRESET) + return err; + ack_start = -1; + } + } + if (unlikely(ack_start == -1)) + ack_start = ack_end = prev_idx(start, dr); + return vnet_send_ack(port, dr, ack_start, ack_end, VIO_DRING_STOPPED); +} + +static int vnet_rx(struct vnet_port *port, void *msgbuf) +{ + struct vio_dring_data *pkt = msgbuf; + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_RX_RING]; + struct vio_driver_state *vio = &port->vio; + + viodbg(DATA, "vnet_rx stype_env[%04x] seq[%016lx] rcv_nxt[%016lx]\n", + pkt->tag.stype_env, pkt->seq, dr->rcv_nxt); + + if (unlikely(pkt->tag.stype_env != VIO_DRING_DATA)) + return 0; + if (unlikely(pkt->seq != dr->rcv_nxt)) { + printk(KERN_ERR PFX "RX out of sequence seq[0x%lx] " + "rcv_nxt[0x%lx]\n", pkt->seq, dr->rcv_nxt); + return 0; + } + + dr->rcv_nxt++; + + /* XXX Validate pkt->start_idx and pkt->end_idx XXX */ + + return vnet_walk_rx(port, dr, pkt->start_idx, pkt->end_idx); +} + +static int idx_is_pending(struct vio_dring_state *dr, u32 end) +{ + u32 idx = dr->cons; + int found = 0; + + while (idx != dr->prod) { + if (idx == end) { + found = 1; + break; + } + idx = next_idx(idx, dr); + } + return found; +} + +static int vnet_ack(struct vnet_port *port, void *msgbuf) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + struct vio_dring_data *pkt = msgbuf; + struct net_device *dev; + struct vnet *vp; + u32 end; + + if (unlikely(pkt->tag.stype_env != VIO_DRING_DATA)) + return 0; + + end = pkt->end_idx; + if (unlikely(!idx_is_pending(dr, end))) + return 0; + + dr->cons = next_idx(end, dr); + + vp = port->vp; + dev = vp->dev; + if (unlikely(netif_queue_stopped(dev) && + vnet_tx_dring_avail(dr) >= VNET_TX_WAKEUP_THRESH(dr))) + return 1; + + return 0; +} + +static int vnet_nack(struct vnet_port *port, void *msgbuf) +{ + /* XXX just reset or similar XXX */ + return 0; +} + +static void maybe_tx_wakeup(struct vnet *vp) +{ + struct net_device *dev = vp->dev; + + netif_tx_lock(dev); + if (likely(netif_queue_stopped(dev))) { + struct vnet_port *port; + int wake = 1; + + list_for_each_entry(port, &vp->port_list, list) { + struct vio_dring_state *dr; + + dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + if (vnet_tx_dring_avail(dr) < + VNET_TX_WAKEUP_THRESH(dr)) { + wake = 0; + break; + } + } + if (wake) + netif_wake_queue(dev); + } + netif_tx_unlock(dev); +} + +static void vnet_event(void *arg, int event) +{ + struct vnet_port *port = arg; + struct vio_driver_state *vio = &port->vio; + unsigned long flags; + int tx_wakeup, err; + + spin_lock_irqsave(&vio->lock, flags); + + if (unlikely(event == LDC_EVENT_RESET || + event == LDC_EVENT_UP)) { + vio_link_state_change(vio, event); + spin_unlock_irqrestore(&vio->lock, flags); + + return; + } + + if (unlikely(event != LDC_EVENT_DATA_READY)) { + printk(KERN_WARNING PFX "Unexpected LDC event %d\n", event); + spin_unlock_irqrestore(&vio->lock, flags); + return; + } + + tx_wakeup = err = 0; + while (1) { + union { + struct vio_msg_tag tag; + u64 raw[8]; + } msgbuf; + + err = ldc_read(vio->lp, &msgbuf, sizeof(msgbuf)); + if (unlikely(err < 0)) { + if (err == -ECONNRESET) + vio_conn_reset(vio); + break; + } + if (err == 0) + break; + viodbg(DATA, "TAG [%02x:%02x:%04x:%08x]\n", + msgbuf.tag.type, + msgbuf.tag.stype, + msgbuf.tag.stype_env, + msgbuf.tag.sid); + err = vio_validate_sid(vio, &msgbuf.tag); + if (err < 0) + break; + + if (likely(msgbuf.tag.type == VIO_TYPE_DATA)) { + if (msgbuf.tag.stype == VIO_SUBTYPE_INFO) { + err = vnet_rx(port, &msgbuf); + } else if (msgbuf.tag.stype == VIO_SUBTYPE_ACK) { + err = vnet_ack(port, &msgbuf); + if (err > 0) + tx_wakeup |= err; + } else if (msgbuf.tag.stype == VIO_SUBTYPE_NACK) { + err = vnet_nack(port, &msgbuf); + } + } else if (msgbuf.tag.type == VIO_TYPE_CTRL) { + err = vio_control_pkt_engine(vio, &msgbuf); + if (err) + break; + } else { + err = vnet_handle_unknown(port, &msgbuf); + } + if (err == -ECONNRESET) + break; + } + spin_unlock(&vio->lock); + if (unlikely(tx_wakeup && err != -ECONNRESET)) + maybe_tx_wakeup(port->vp); + local_irq_restore(flags); +} + +static int __vnet_tx_trigger(struct vnet_port *port) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + struct vio_dring_data hdr = { + .tag = { + .type = VIO_TYPE_DATA, + .stype = VIO_SUBTYPE_INFO, + .stype_env = VIO_DRING_DATA, + .sid = vio_send_sid(&port->vio), + }, + .dring_ident = dr->ident, + .start_idx = dr->prod, + .end_idx = (u32) -1, + }; + int err, delay; + + hdr.seq = dr->snd_nxt; + delay = 1; + do { + err = vio_ldc_send(&port->vio, &hdr, sizeof(hdr)); + if (err > 0) { + dr->snd_nxt++; + break; + } + udelay(delay); + if ((delay <<= 1) > 128) + delay = 128; + } while (err == -EAGAIN); + + return err; +} + +struct vnet_port *__tx_port_find(struct vnet *vp, struct sk_buff *skb) +{ + unsigned int hash = vnet_hashfn(skb->data); + struct hlist_head *hp = &vp->port_hash[hash]; + struct hlist_node *n; + struct vnet_port *port; + + hlist_for_each_entry(port, n, hp, hash) { + if (!compare_ether_addr(port->raddr, skb->data)) + return port; + } + port = NULL; + if (!list_empty(&vp->port_list)) + port = list_entry(vp->port_list.next, struct vnet_port, list); + + return port; +} + +struct vnet_port *tx_port_find(struct vnet *vp, struct sk_buff *skb) +{ + struct vnet_port *ret; + unsigned long flags; + + spin_lock_irqsave(&vp->lock, flags); + ret = __tx_port_find(vp, skb); + spin_unlock_irqrestore(&vp->lock, flags); + + return ret; +} + +static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct vnet *vp = netdev_priv(dev); + struct vnet_port *port = tx_port_find(vp, skb); + struct vio_dring_state *dr; + struct vio_net_desc *d; + unsigned long flags; + unsigned int len; + void *tx_buf; + int i, err; + + if (unlikely(!port)) + goto out_dropped; + + spin_lock_irqsave(&port->vio.lock, flags); + + dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + if (unlikely(vnet_tx_dring_avail(dr) < 2)) { + if (!netif_queue_stopped(dev)) { + netif_stop_queue(dev); + + /* This is a hard error, log it. */ + printk(KERN_ERR PFX "%s: BUG! Tx Ring full when " + "queue awake!\n", dev->name); + dev->stats.tx_errors++; + } + spin_unlock_irqrestore(&port->vio.lock, flags); + return NETDEV_TX_BUSY; + } + + d = vio_dring_cur(dr); + + tx_buf = port->tx_bufs[dr->prod].buf; + skb_copy_from_linear_data(skb, tx_buf + VNET_PACKET_SKIP, skb->len); + + len = skb->len; + if (len < ETH_ZLEN) { + len = ETH_ZLEN; + memset(tx_buf+VNET_PACKET_SKIP+skb->len, 0, len - skb->len); + } + + d->hdr.ack = VIO_ACK_ENABLE; + d->size = len; + d->ncookies = port->tx_bufs[dr->prod].ncookies; + for (i = 0; i < d->ncookies; i++) + d->cookies[i] = port->tx_bufs[dr->prod].cookies[i]; + + /* This has to be a non-SMP write barrier because we are writing + * to memory which is shared with the peer LDOM. + */ + wmb(); + + d->hdr.state = VIO_DESC_READY; + + err = __vnet_tx_trigger(port); + if (unlikely(err < 0)) { + printk(KERN_INFO PFX "%s: TX trigger error %d\n", + dev->name, err); + d->hdr.state = VIO_DESC_FREE; + dev->stats.tx_carrier_errors++; + goto out_dropped_unlock; + } + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + dr->prod = (dr->prod + 1) & (VNET_TX_RING_SIZE - 1); + if (unlikely(vnet_tx_dring_avail(dr) < 2)) { + netif_stop_queue(dev); + if (vnet_tx_dring_avail(dr) > VNET_TX_WAKEUP_THRESH(dr)) + netif_wake_queue(dev); + } + + spin_unlock_irqrestore(&port->vio.lock, flags); + + dev_kfree_skb(skb); + + dev->trans_start = jiffies; + return NETDEV_TX_OK; + +out_dropped_unlock: + spin_unlock_irqrestore(&port->vio.lock, flags); + +out_dropped: + dev_kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; +} + +static void vnet_tx_timeout(struct net_device *dev) +{ + /* XXX Implement me XXX */ +} + +static int vnet_open(struct net_device *dev) +{ + netif_carrier_on(dev); + netif_start_queue(dev); + + return 0; +} + +static int vnet_close(struct net_device *dev) +{ + netif_stop_queue(dev); + netif_carrier_off(dev); + + return 0; +} + +static void vnet_set_rx_mode(struct net_device *dev) +{ + /* XXX Implement multicast support XXX */ +} + +static int vnet_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu != ETH_DATA_LEN) + return -EINVAL; + + dev->mtu = new_mtu; + return 0; +} + +static int vnet_set_mac_addr(struct net_device *dev, void *p) +{ + return -EINVAL; +} + +static void vnet_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + strcpy(info->driver, DRV_MODULE_NAME); + strcpy(info->version, DRV_MODULE_VERSION); +} + +static u32 vnet_get_msglevel(struct net_device *dev) +{ + struct vnet *vp = netdev_priv(dev); + return vp->msg_enable; +} + +static void vnet_set_msglevel(struct net_device *dev, u32 value) +{ + struct vnet *vp = netdev_priv(dev); + vp->msg_enable = value; +} + +static const struct ethtool_ops vnet_ethtool_ops = { + .get_drvinfo = vnet_get_drvinfo, + .get_msglevel = vnet_get_msglevel, + .set_msglevel = vnet_set_msglevel, + .get_link = ethtool_op_get_link, + .get_perm_addr = ethtool_op_get_perm_addr, +}; + +static void vnet_port_free_tx_bufs(struct vnet_port *port) +{ + struct vio_dring_state *dr; + int i; + + dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + if (dr->base) { + ldc_free_exp_dring(port->vio.lp, dr->base, + (dr->entry_size * dr->num_entries), + dr->cookies, dr->ncookies); + dr->base = NULL; + dr->entry_size = 0; + dr->num_entries = 0; + dr->pending = 0; + dr->ncookies = 0; + } + + for (i = 0; i < VNET_TX_RING_SIZE; i++) { + void *buf = port->tx_bufs[i].buf; + + if (!buf) + continue; + + ldc_unmap(port->vio.lp, + port->tx_bufs[i].cookies, + port->tx_bufs[i].ncookies); + + kfree(buf); + port->tx_bufs[i].buf = NULL; + } +} + +static int __devinit vnet_port_alloc_tx_bufs(struct vnet_port *port) +{ + struct vio_dring_state *dr; + unsigned long len; + int i, err, ncookies; + void *dring; + + for (i = 0; i < VNET_TX_RING_SIZE; i++) { + void *buf = kzalloc(ETH_FRAME_LEN + 8, GFP_KERNEL); + int map_len = (ETH_FRAME_LEN + 7) & ~7; + + err = -ENOMEM; + if (!buf) { + printk(KERN_ERR "TX buffer allocation failure\n"); + goto err_out; + } + err = -EFAULT; + if ((unsigned long)buf & (8UL - 1)) { + printk(KERN_ERR "TX buffer misaligned\n"); + kfree(buf); + goto err_out; + } + + err = ldc_map_single(port->vio.lp, buf, map_len, + port->tx_bufs[i].cookies, 2, + (LDC_MAP_SHADOW | + LDC_MAP_DIRECT | + LDC_MAP_RW)); + if (err < 0) { + kfree(buf); + goto err_out; + } + port->tx_bufs[i].buf = buf; + port->tx_bufs[i].ncookies = err; + } + + dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + + len = (VNET_TX_RING_SIZE * + (sizeof(struct vio_net_desc) + + (sizeof(struct ldc_trans_cookie) * 2))); + + ncookies = VIO_MAX_RING_COOKIES; + dring = ldc_alloc_exp_dring(port->vio.lp, len, + dr->cookies, &ncookies, + (LDC_MAP_SHADOW | + LDC_MAP_DIRECT | + LDC_MAP_RW)); + if (IS_ERR(dring)) { + err = PTR_ERR(dring); + goto err_out; + } + + dr->base = dring; + dr->entry_size = (sizeof(struct vio_net_desc) + + (sizeof(struct ldc_trans_cookie) * 2)); + dr->num_entries = VNET_TX_RING_SIZE; + dr->prod = dr->cons = 0; + dr->pending = VNET_TX_RING_SIZE; + dr->ncookies = ncookies; + + return 0; + +err_out: + vnet_port_free_tx_bufs(port); + + return err; +} + +static struct ldc_channel_config vnet_ldc_cfg = { + .event = vnet_event, + .mtu = 64, + .mode = LDC_MODE_UNRELIABLE, +}; + +static struct vio_driver_ops vnet_vio_ops = { + .send_attr = vnet_send_attr, + .handle_attr = vnet_handle_attr, + .handshake_complete = vnet_handshake_complete, +}; + +const char *remote_macaddr_prop = "remote-mac-address"; + +static int __devinit vnet_port_probe(struct vio_dev *vdev, + const struct vio_device_id *id) +{ + struct mdesc_handle *hp; + struct vnet_port *port; + unsigned long flags; + struct vnet *vp; + const u64 *rmac; + int len, i, err, switch_port; + + vp = dev_get_drvdata(vdev->dev.parent); + if (!vp) { + printk(KERN_ERR PFX "Cannot find port parent vnet.\n"); + return -ENODEV; + } + + hp = mdesc_grab(); + + rmac = mdesc_get_property(hp, vdev->mp, remote_macaddr_prop, &len); + err = -ENODEV; + if (!rmac) { + printk(KERN_ERR PFX "Port lacks %s property.\n", + remote_macaddr_prop); + goto err_out_put_mdesc; + } + + port = kzalloc(sizeof(*port), GFP_KERNEL); + err = -ENOMEM; + if (!port) { + printk(KERN_ERR PFX "Cannot allocate vnet_port.\n"); + goto err_out_put_mdesc; + } + + for (i = 0; i < ETH_ALEN; i++) + port->raddr[i] = (*rmac >> (5 - i) * 8) & 0xff; + + port->vp = vp; + + err = vio_driver_init(&port->vio, vdev, VDEV_NETWORK, + vnet_versions, ARRAY_SIZE(vnet_versions), + &vnet_vio_ops, vp->dev->name); + if (err) + goto err_out_free_port; + + err = vio_ldc_alloc(&port->vio, &vnet_ldc_cfg, port); + if (err) + goto err_out_free_port; + + err = vnet_port_alloc_tx_bufs(port); + if (err) + goto err_out_free_ldc; + + INIT_HLIST_NODE(&port->hash); + INIT_LIST_HEAD(&port->list); + + switch_port = 0; + if (mdesc_get_property(hp, vdev->mp, "switch-port", NULL) != NULL) + switch_port = 1; + + spin_lock_irqsave(&vp->lock, flags); + if (switch_port) + list_add(&port->list, &vp->port_list); + else + list_add_tail(&port->list, &vp->port_list); + hlist_add_head(&port->hash, &vp->port_hash[vnet_hashfn(port->raddr)]); + spin_unlock_irqrestore(&vp->lock, flags); + + dev_set_drvdata(&vdev->dev, port); + + printk(KERN_INFO "%s: PORT ( remote-mac ", vp->dev->name); + for (i = 0; i < 6; i++) + printk("%2.2x%c", port->raddr[i], i == 5 ? ' ' : ':'); + if (switch_port) + printk("switch-port "); + printk(")\n"); + + vio_port_up(&port->vio); + + mdesc_release(hp); + + return 0; + +err_out_free_ldc: + vio_ldc_free(&port->vio); + +err_out_free_port: + kfree(port); + +err_out_put_mdesc: + mdesc_release(hp); + return err; +} + +static int vnet_port_remove(struct vio_dev *vdev) +{ + struct vnet_port *port = dev_get_drvdata(&vdev->dev); + + if (port) { + struct vnet *vp = port->vp; + unsigned long flags; + + del_timer_sync(&port->vio.timer); + + spin_lock_irqsave(&vp->lock, flags); + list_del(&port->list); + hlist_del(&port->hash); + spin_unlock_irqrestore(&vp->lock, flags); + + vnet_port_free_tx_bufs(port); + vio_ldc_free(&port->vio); + + dev_set_drvdata(&vdev->dev, NULL); + + kfree(port); + } + return 0; +} + +static struct vio_device_id vnet_port_match[] = { + { + .type = "vnet-port", + }, + {}, +}; +MODULE_DEVICE_TABLE(vio, vnet_match); + +static struct vio_driver vnet_port_driver = { + .id_table = vnet_port_match, + .probe = vnet_port_probe, + .remove = vnet_port_remove, + .driver = { + .name = "vnet_port", + .owner = THIS_MODULE, + } +}; + +const char *local_mac_prop = "local-mac-address"; + +static int __devinit vnet_probe(struct vio_dev *vdev, + const struct vio_device_id *id) +{ + static int vnet_version_printed; + struct mdesc_handle *hp; + struct net_device *dev; + struct vnet *vp; + const u64 *mac; + int err, i, len; + + if (vnet_version_printed++ == 0) + printk(KERN_INFO "%s", version); + + hp = mdesc_grab(); + + mac = mdesc_get_property(hp, vdev->mp, local_mac_prop, &len); + if (!mac) { + printk(KERN_ERR PFX "vnet lacks %s property.\n", + local_mac_prop); + err = -ENODEV; + goto err_out; + } + + dev = alloc_etherdev(sizeof(*vp)); + if (!dev) { + printk(KERN_ERR PFX "Etherdev alloc failed, aborting.\n"); + err = -ENOMEM; + goto err_out; + } + + for (i = 0; i < ETH_ALEN; i++) + dev->dev_addr[i] = (*mac >> (5 - i) * 8) & 0xff; + + memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); + + SET_NETDEV_DEV(dev, &vdev->dev); + + vp = netdev_priv(dev); + + spin_lock_init(&vp->lock); + vp->dev = dev; + vp->vdev = vdev; + + INIT_LIST_HEAD(&vp->port_list); + for (i = 0; i < VNET_PORT_HASH_SIZE; i++) + INIT_HLIST_HEAD(&vp->port_hash[i]); + + dev->open = vnet_open; + dev->stop = vnet_close; + dev->set_multicast_list = vnet_set_rx_mode; + dev->set_mac_address = vnet_set_mac_addr; + dev->tx_timeout = vnet_tx_timeout; + dev->ethtool_ops = &vnet_ethtool_ops; + dev->watchdog_timeo = VNET_TX_TIMEOUT; + dev->change_mtu = vnet_change_mtu; + dev->hard_start_xmit = vnet_start_xmit; + + err = register_netdev(dev); + if (err) { + printk(KERN_ERR PFX "Cannot register net device, " + "aborting.\n"); + goto err_out_free_dev; + } + + printk(KERN_INFO "%s: Sun LDOM vnet ", dev->name); + + for (i = 0; i < 6; i++) + printk("%2.2x%c", dev->dev_addr[i], i == 5 ? '\n' : ':'); + + dev_set_drvdata(&vdev->dev, vp); + + mdesc_release(hp); + + return 0; + +err_out_free_dev: + free_netdev(dev); + +err_out: + mdesc_release(hp); + return err; +} + +static int vnet_remove(struct vio_dev *vdev) +{ + + struct vnet *vp = dev_get_drvdata(&vdev->dev); + + if (vp) { + /* XXX unregister port, or at least check XXX */ + unregister_netdevice(vp->dev); + dev_set_drvdata(&vdev->dev, NULL); + } + return 0; +} + +static struct vio_device_id vnet_match[] = { + { + .type = "network", + }, + {}, +}; +MODULE_DEVICE_TABLE(vio, vnet_match); + +static struct vio_driver vnet_driver = { + .id_table = vnet_match, + .probe = vnet_probe, + .remove = vnet_remove, + .driver = { + .name = "vnet", + .owner = THIS_MODULE, + } +}; + +static int __init vnet_init(void) +{ + int err = vio_register_driver(&vnet_driver); + + if (!err) { + err = vio_register_driver(&vnet_port_driver); + if (err) + vio_unregister_driver(&vnet_driver); + } + + return err; +} + +static void __exit vnet_exit(void) +{ + vio_unregister_driver(&vnet_port_driver); + vio_unregister_driver(&vnet_driver); +} + +module_init(vnet_init); +module_exit(vnet_exit); diff --git a/drivers/net/sunvnet.h b/drivers/net/sunvnet.h new file mode 100644 index 000000000000..1c887302d46d --- /dev/null +++ b/drivers/net/sunvnet.h @@ -0,0 +1,70 @@ +#ifndef _SUNVNET_H +#define _SUNVNET_H + +#define DESC_NCOOKIES(entry_size) \ + ((entry_size) - sizeof(struct vio_net_desc)) + +/* length of time before we decide the hardware is borked, + * and dev->tx_timeout() should be called to fix the problem + */ +#define VNET_TX_TIMEOUT (5 * HZ) + +#define VNET_TX_RING_SIZE 512 +#define VNET_TX_WAKEUP_THRESH(dr) ((dr)->pending / 4) + +/* VNET packets are sent in buffers with the first 6 bytes skipped + * so that after the ethernet header the IPv4/IPv6 headers are aligned + * properly. + */ +#define VNET_PACKET_SKIP 6 + +struct vnet_tx_entry { + void *buf; + unsigned int ncookies; + struct ldc_trans_cookie cookies[2]; +}; + +struct vnet; +struct vnet_port { + struct vio_driver_state vio; + + struct hlist_node hash; + u8 raddr[ETH_ALEN]; + + struct vnet *vp; + + struct vnet_tx_entry tx_bufs[VNET_TX_RING_SIZE]; + + struct list_head list; +}; + +static inline struct vnet_port *to_vnet_port(struct vio_driver_state *vio) +{ + return container_of(vio, struct vnet_port, vio); +} + +#define VNET_PORT_HASH_SIZE 16 +#define VNET_PORT_HASH_MASK (VNET_PORT_HASH_SIZE - 1) + +static inline unsigned int vnet_hashfn(u8 *mac) +{ + unsigned int val = mac[4] ^ mac[5]; + + return val & (VNET_PORT_HASH_MASK); +} + +struct vnet { + /* Protects port_list and port_hash. */ + spinlock_t lock; + + struct net_device *dev; + + u32 msg_enable; + struct vio_dev *vdev; + + struct list_head port_list; + + struct hlist_head port_hash[VNET_PORT_HASH_SIZE]; +}; + +#endif /* _SUNVNET_H */ diff --git a/drivers/serial/sunhv.c b/drivers/serial/sunhv.c index 96557e6dba60..17bcca53d6a1 100644 --- a/drivers/serial/sunhv.c +++ b/drivers/serial/sunhv.c @@ -440,8 +440,16 @@ static void sunhv_console_write_paged(struct console *con, const char *s, unsign { struct uart_port *port = sunhv_port; unsigned long flags; + int locked = 1; + + local_irq_save(flags); + if (port->sysrq) { + locked = 0; + } else if (oops_in_progress) { + locked = spin_trylock(&port->lock); + } else + spin_lock(&port->lock); - spin_lock_irqsave(&port->lock, flags); while (n > 0) { unsigned long ra = __pa(con_write_page); unsigned long page_bytes; @@ -469,7 +477,10 @@ static void sunhv_console_write_paged(struct console *con, const char *s, unsign ra += written; } } - spin_unlock_irqrestore(&port->lock, flags); + + if (locked) + spin_unlock(&port->lock); + local_irq_restore(flags); } static inline void sunhv_console_putchar(struct uart_port *port, char c) @@ -488,7 +499,15 @@ static void sunhv_console_write_bychar(struct console *con, const char *s, unsig { struct uart_port *port = sunhv_port; unsigned long flags; - int i; + int i, locked = 1; + + local_irq_save(flags); + if (port->sysrq) { + locked = 0; + } else if (oops_in_progress) { + locked = spin_trylock(&port->lock); + } else + spin_lock(&port->lock); spin_lock_irqsave(&port->lock, flags); for (i = 0; i < n; i++) { @@ -496,7 +515,10 @@ static void sunhv_console_write_bychar(struct console *con, const char *s, unsig sunhv_console_putchar(port, '\r'); sunhv_console_putchar(port, *s++); } - spin_unlock_irqrestore(&port->lock, flags); + + if (locked) + spin_unlock(&port->lock); + local_irq_restore(flags); } static struct console sunhv_console = { diff --git a/drivers/serial/sunsab.c b/drivers/serial/sunsab.c index deb9ab4b5a0b..8a0f9e4408d4 100644 --- a/drivers/serial/sunsab.c +++ b/drivers/serial/sunsab.c @@ -860,22 +860,31 @@ static int num_channels; static void sunsab_console_putchar(struct uart_port *port, int c) { struct uart_sunsab_port *up = (struct uart_sunsab_port *)port; - unsigned long flags; - - spin_lock_irqsave(&up->port.lock, flags); sunsab_tec_wait(up); writeb(c, &up->regs->w.tic); - - spin_unlock_irqrestore(&up->port.lock, flags); } static void sunsab_console_write(struct console *con, const char *s, unsigned n) { struct uart_sunsab_port *up = &sunsab_ports[con->index]; + unsigned long flags; + int locked = 1; + + local_irq_save(flags); + if (up->port.sysrq) { + locked = 0; + } else if (oops_in_progress) { + locked = spin_trylock(&up->port.lock); + } else + spin_lock(&up->port.lock); uart_console_write(&up->port, s, n, sunsab_console_putchar); sunsab_tec_wait(up); + + if (locked) + spin_unlock(&up->port.lock); + local_irq_restore(flags); } static int sunsab_console_setup(struct console *con, char *options) diff --git a/drivers/serial/sunsu.c b/drivers/serial/sunsu.c index 2a63cdba3208..26d720baf88c 100644 --- a/drivers/serial/sunsu.c +++ b/drivers/serial/sunsu.c @@ -1288,7 +1288,17 @@ static void sunsu_console_write(struct console *co, const char *s, unsigned int count) { struct uart_sunsu_port *up = &sunsu_ports[co->index]; + unsigned long flags; unsigned int ier; + int locked = 1; + + local_irq_save(flags); + if (up->port.sysrq) { + locked = 0; + } else if (oops_in_progress) { + locked = spin_trylock(&up->port.lock); + } else + spin_lock(&up->port.lock); /* * First save the UER then disable the interrupts @@ -1304,6 +1314,10 @@ static void sunsu_console_write(struct console *co, const char *s, */ wait_for_xmitr(up); serial_out(up, UART_IER, ier); + + if (locked) + spin_unlock(&up->port.lock); + local_irq_restore(flags); } /* diff --git a/drivers/serial/sunzilog.c b/drivers/serial/sunzilog.c index 15b6e1cb040b..0a3e10a4a35d 100644 --- a/drivers/serial/sunzilog.c +++ b/drivers/serial/sunzilog.c @@ -9,7 +9,7 @@ * C. Dost, Pete Zaitcev, Ted Ts'o and Alex Buell for their * work there. * - * Copyright (C) 2002, 2006 David S. Miller (davem@davemloft.net) + * Copyright (C) 2002, 2006, 2007 David S. Miller (davem@davemloft.net) */ #include <linux/module.h> @@ -1151,11 +1151,22 @@ sunzilog_console_write(struct console *con, const char *s, unsigned int count) { struct uart_sunzilog_port *up = &sunzilog_port_table[con->index]; unsigned long flags; + int locked = 1; + + local_irq_save(flags); + if (up->port.sysrq) { + locked = 0; + } else if (oops_in_progress) { + locked = spin_trylock(&up->port.lock); + } else + spin_lock(&up->port.lock); - spin_lock_irqsave(&up->port.lock, flags); uart_console_write(&up->port, s, count, sunzilog_putchar); udelay(2); - spin_unlock_irqrestore(&up->port.lock, flags); + + if (locked) + spin_unlock(&up->port.lock); + local_irq_restore(flags); } static int __init sunzilog_console_setup(struct console *con, char *options) diff --git a/include/asm-sparc64/bugs.h b/include/asm-sparc64/bugs.h index bf39d86c0c9e..11ade6841971 100644 --- a/include/asm-sparc64/bugs.h +++ b/include/asm-sparc64/bugs.h @@ -4,12 +4,7 @@ */ #include <asm/sstate.h> -extern unsigned long loops_per_jiffy; - static void __init check_bugs(void) { -#ifndef CONFIG_SMP - cpu_data(0).udelay_val = loops_per_jiffy; -#endif sstate_running(); } diff --git a/include/asm-sparc64/cpudata.h b/include/asm-sparc64/cpudata.h index 445026fbec35..98a6e609163e 100644 --- a/include/asm-sparc64/cpudata.h +++ b/include/asm-sparc64/cpudata.h @@ -19,7 +19,7 @@ typedef struct { unsigned int __softirq_pending; /* must be 1st, see rtrap.S */ unsigned int __pad0; unsigned long clock_tick; /* %tick's per second */ - unsigned long udelay_val; + unsigned long __pad; unsigned int __pad1; unsigned int __pad2; @@ -80,7 +80,8 @@ struct trap_per_cpu { unsigned int dev_mondo_qmask; unsigned int resum_qmask; unsigned int nonresum_qmask; - unsigned int __pad2[3]; + unsigned int __pad2[1]; + void *hdesc; } __attribute__((aligned(64))); extern struct trap_per_cpu trap_block[NR_CPUS]; extern void init_cur_cpu_trap(struct thread_info *); diff --git a/include/asm-sparc64/delay.h b/include/asm-sparc64/delay.h index a4aae6f80627..a77aa622d762 100644 --- a/include/asm-sparc64/delay.h +++ b/include/asm-sparc64/delay.h @@ -1,37 +1,17 @@ /* delay.h: Linux delay routines on sparc64. * - * Copyright (C) 1996, 2004 David S. Miller (davem@davemloft.net). - * - * Based heavily upon x86 variant which is: - * Copyright (C) 1993 Linus Torvalds - * - * Delay routines calling functions in arch/sparc64/lib/delay.c + * Copyright (C) 1996, 2004, 2007 David S. Miller (davem@davemloft.net). */ -#ifndef __SPARC64_DELAY_H -#define __SPARC64_DELAY_H - -#include <linux/param.h> -#include <asm/cpudata.h> +#ifndef _SPARC64_DELAY_H +#define _SPARC64_DELAY_H #ifndef __ASSEMBLY__ -extern void __bad_udelay(void); -extern void __bad_ndelay(void); - -extern void __udelay(unsigned long usecs); -extern void __ndelay(unsigned long nsecs); -extern void __const_udelay(unsigned long usecs); extern void __delay(unsigned long loops); - -#define udelay(n) (__builtin_constant_p(n) ? \ - ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \ - __udelay(n)) - -#define ndelay(n) (__builtin_constant_p(n) ? \ - ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \ - __ndelay(n)) +extern void udelay(unsigned long usecs); +#define mdelay(n) udelay((n) * 1000) #endif /* !__ASSEMBLY__ */ -#endif /* defined(__SPARC64_DELAY_H) */ +#endif /* _SPARC64_DELAY_H */ diff --git a/include/asm-sparc64/hvtramp.h b/include/asm-sparc64/hvtramp.h new file mode 100644 index 000000000000..c7dd6ad056df --- /dev/null +++ b/include/asm-sparc64/hvtramp.h @@ -0,0 +1,37 @@ +#ifndef _SPARC64_HVTRAP_H +#define _SPARC64_HVTRAP_H + +#ifndef __ASSEMBLY__ + +#include <linux/types.h> + +struct hvtramp_mapping { + __u64 vaddr; + __u64 tte; +}; + +struct hvtramp_descr { + __u32 cpu; + __u32 num_mappings; + __u64 fault_info_va; + __u64 fault_info_pa; + __u64 thread_reg; + struct hvtramp_mapping maps[2]; +}; + +extern void hv_cpu_startup(unsigned long hvdescr_pa); + +#endif + +#define HVTRAMP_DESCR_CPU 0x00 +#define HVTRAMP_DESCR_NUM_MAPPINGS 0x04 +#define HVTRAMP_DESCR_FAULT_INFO_VA 0x08 +#define HVTRAMP_DESCR_FAULT_INFO_PA 0x10 +#define HVTRAMP_DESCR_THREAD_REG 0x18 +#define HVTRAMP_DESCR_MAPS 0x20 + +#define HVTRAMP_MAPPING_VADDR 0x00 +#define HVTRAMP_MAPPING_TTE 0x08 +#define HVTRAMP_MAPPING_SIZE 0x10 + +#endif /* _SPARC64_HVTRAP_H */ diff --git a/include/asm-sparc64/hypervisor.h b/include/asm-sparc64/hypervisor.h index db2130a95d68..524d49835dfd 100644 --- a/include/asm-sparc64/hypervisor.h +++ b/include/asm-sparc64/hypervisor.h @@ -98,7 +98,7 @@ #define HV_FAST_MACH_EXIT 0x00 #ifndef __ASSEMBLY__ -extern void sun4v_mach_exit(unsigned long exit_core); +extern void sun4v_mach_exit(unsigned long exit_code); #endif /* Domain services. */ diff --git a/include/asm-sparc64/irq.h b/include/asm-sparc64/irq.h index 90781e34a95c..e6c436ef9356 100644 --- a/include/asm-sparc64/irq.h +++ b/include/asm-sparc64/irq.h @@ -53,6 +53,8 @@ extern unsigned int sun4v_build_msi(u32 devhandle, unsigned int *virt_irq_p, extern void sun4v_destroy_msi(unsigned int virt_irq); extern unsigned int sbus_build_irq(void *sbus, unsigned int ino); +extern void fixup_irqs(void); + static __inline__ void set_softint(unsigned long bits) { __asm__ __volatile__("wr %0, 0x0, %%set_softint" diff --git a/include/asm-sparc64/ldc.h b/include/asm-sparc64/ldc.h new file mode 100644 index 000000000000..bdb524a7b814 --- /dev/null +++ b/include/asm-sparc64/ldc.h @@ -0,0 +1,138 @@ +#ifndef _SPARC64_LDC_H +#define _SPARC64_LDC_H + +#include <asm/hypervisor.h> + +extern int ldom_domaining_enabled; +extern void ldom_set_var(const char *var, const char *value); +extern void ldom_reboot(const char *boot_command); +extern void ldom_power_off(void); + +/* The event handler will be evoked when link state changes + * or data becomes available on the receive side. + * + * For non-RAW links, if the LDC_EVENT_RESET event arrives the + * driver should reset all of it's internal state and reinvoke + * ldc_connect() to try and bring the link up again. + * + * For RAW links, ldc_connect() is not used. Instead the driver + * just waits for the LDC_EVENT_UP event. + */ +struct ldc_channel_config { + void (*event)(void *arg, int event); + + u32 mtu; + unsigned int rx_irq; + unsigned int tx_irq; + u8 mode; +#define LDC_MODE_RAW 0x00 +#define LDC_MODE_UNRELIABLE 0x01 +#define LDC_MODE_RESERVED 0x02 +#define LDC_MODE_STREAM 0x03 + + u8 debug; +#define LDC_DEBUG_HS 0x01 +#define LDC_DEBUG_STATE 0x02 +#define LDC_DEBUG_RX 0x04 +#define LDC_DEBUG_TX 0x08 +#define LDC_DEBUG_DATA 0x10 +}; + +#define LDC_EVENT_RESET 0x01 +#define LDC_EVENT_UP 0x02 +#define LDC_EVENT_DATA_READY 0x04 + +#define LDC_STATE_INVALID 0x00 +#define LDC_STATE_INIT 0x01 +#define LDC_STATE_BOUND 0x02 +#define LDC_STATE_READY 0x03 +#define LDC_STATE_CONNECTED 0x04 + +struct ldc_channel; + +/* Allocate state for a channel. */ +extern struct ldc_channel *ldc_alloc(unsigned long id, + const struct ldc_channel_config *cfgp, + void *event_arg); + +/* Shut down and free state for a channel. */ +extern void ldc_free(struct ldc_channel *lp); + +/* Register TX and RX queues of the link with the hypervisor. */ +extern int ldc_bind(struct ldc_channel *lp, const char *name); + +/* For non-RAW protocols we need to complete a handshake before + * communication can proceed. ldc_connect() does that, if the + * handshake completes successfully, an LDC_EVENT_UP event will + * be sent up to the driver. + */ +extern int ldc_connect(struct ldc_channel *lp); +extern int ldc_disconnect(struct ldc_channel *lp); + +extern int ldc_state(struct ldc_channel *lp); + +/* Read and write operations. Only valid when the link is up. */ +extern int ldc_write(struct ldc_channel *lp, const void *buf, + unsigned int size); +extern int ldc_read(struct ldc_channel *lp, void *buf, unsigned int size); + +#define LDC_MAP_SHADOW 0x01 +#define LDC_MAP_DIRECT 0x02 +#define LDC_MAP_IO 0x04 +#define LDC_MAP_R 0x08 +#define LDC_MAP_W 0x10 +#define LDC_MAP_X 0x20 +#define LDC_MAP_RW (LDC_MAP_R | LDC_MAP_W) +#define LDC_MAP_RWX (LDC_MAP_R | LDC_MAP_W | LDC_MAP_X) +#define LDC_MAP_ALL 0x03f + +struct ldc_trans_cookie { + u64 cookie_addr; + u64 cookie_size; +}; + +struct scatterlist; +extern int ldc_map_sg(struct ldc_channel *lp, + struct scatterlist *sg, int num_sg, + struct ldc_trans_cookie *cookies, int ncookies, + unsigned int map_perm); + +extern int ldc_map_single(struct ldc_channel *lp, + void *buf, unsigned int len, + struct ldc_trans_cookie *cookies, int ncookies, + unsigned int map_perm); + +extern void ldc_unmap(struct ldc_channel *lp, struct ldc_trans_cookie *cookies, + int ncookies); + +extern int ldc_copy(struct ldc_channel *lp, int copy_dir, + void *buf, unsigned int len, unsigned long offset, + struct ldc_trans_cookie *cookies, int ncookies); + +static inline int ldc_get_dring_entry(struct ldc_channel *lp, + void *buf, unsigned int len, + unsigned long offset, + struct ldc_trans_cookie *cookies, + int ncookies) +{ + return ldc_copy(lp, LDC_COPY_IN, buf, len, offset, cookies, ncookies); +} + +static inline int ldc_put_dring_entry(struct ldc_channel *lp, + void *buf, unsigned int len, + unsigned long offset, + struct ldc_trans_cookie *cookies, + int ncookies) +{ + return ldc_copy(lp, LDC_COPY_OUT, buf, len, offset, cookies, ncookies); +} + +extern void *ldc_alloc_exp_dring(struct ldc_channel *lp, unsigned int len, + struct ldc_trans_cookie *cookies, + int *ncookies, unsigned int map_perm); + +extern void ldc_free_exp_dring(struct ldc_channel *lp, void *buf, + unsigned int len, + struct ldc_trans_cookie *cookies, int ncookies); + +#endif /* _SPARC64_LDC_H */ diff --git a/include/asm-sparc64/mdesc.h b/include/asm-sparc64/mdesc.h index c6383982b53d..e97c43133752 100644 --- a/include/asm-sparc64/mdesc.h +++ b/include/asm-sparc64/mdesc.h @@ -2,38 +2,66 @@ #define _SPARC64_MDESC_H #include <linux/types.h> +#include <linux/cpumask.h> #include <asm/prom.h> -struct mdesc_node; -struct mdesc_arc { - const char *name; - struct mdesc_node *arc; -}; - -struct mdesc_node { - const char *name; - u64 node; - unsigned int unique_id; - unsigned int num_arcs; - unsigned int irqs[2]; - struct property *properties; - struct mdesc_node *hash_next; - struct mdesc_node *allnodes_next; - struct mdesc_arc arcs[0]; -}; - -extern struct mdesc_node *md_find_node_by_name(struct mdesc_node *from, - const char *name); -#define md_for_each_node_by_name(__mn, __name) \ - for (__mn = md_find_node_by_name(NULL, __name); __mn; \ - __mn = md_find_node_by_name(__mn, __name)) - -extern struct property *md_find_property(const struct mdesc_node *mp, - const char *name, - int *lenp); -extern const void *md_get_property(const struct mdesc_node *mp, - const char *name, - int *lenp); +struct mdesc_handle; + +/* Machine description operations are to be surrounded by grab and + * release calls. The mdesc_handle returned from the grab is + * the first argument to all of the operational calls that work + * on mdescs. + */ +extern struct mdesc_handle *mdesc_grab(void); +extern void mdesc_release(struct mdesc_handle *); + +#define MDESC_NODE_NULL (~(u64)0) + +extern u64 mdesc_node_by_name(struct mdesc_handle *handle, + u64 from_node, const char *name); +#define mdesc_for_each_node_by_name(__hdl, __node, __name) \ + for (__node = mdesc_node_by_name(__hdl, MDESC_NODE_NULL, __name); \ + (__node) != MDESC_NODE_NULL; \ + __node = mdesc_node_by_name(__hdl, __node, __name)) + +/* Access to property values returned from mdesc_get_property() are + * only valid inside of a mdesc_grab()/mdesc_release() sequence. + * Once mdesc_release() is called, the memory backed up by these + * pointers may reference freed up memory. + * + * Therefore callers must make copies of any property values + * they need. + * + * These same rules apply to mdesc_node_name(). + */ +extern const void *mdesc_get_property(struct mdesc_handle *handle, + u64 node, const char *name, int *lenp); +extern const char *mdesc_node_name(struct mdesc_handle *hp, u64 node); + +/* MD arc iteration, the standard sequence is: + * + * unsigned long arc; + * mdesc_for_each_arc(arc, handle, node, MDESC_ARC_TYPE_{FWD,BACK}) { + * unsigned long target = mdesc_arc_target(handle, arc); + * ... + * } + */ + +#define MDESC_ARC_TYPE_FWD "fwd" +#define MDESC_ARC_TYPE_BACK "back" + +extern u64 mdesc_next_arc(struct mdesc_handle *handle, u64 from, + const char *arc_type); +#define mdesc_for_each_arc(__arc, __hdl, __node, __type) \ + for (__arc = mdesc_next_arc(__hdl, __node, __type); \ + (__arc) != MDESC_NODE_NULL; \ + __arc = mdesc_next_arc(__hdl, __arc, __type)) + +extern u64 mdesc_arc_target(struct mdesc_handle *hp, u64 arc); + +extern void mdesc_update(void); + +extern void mdesc_fill_in_cpu_data(cpumask_t mask); extern void sun4v_mdesc_init(void); diff --git a/include/asm-sparc64/mmu_context.h b/include/asm-sparc64/mmu_context.h index 8d129032013e..9fc225ed5500 100644 --- a/include/asm-sparc64/mmu_context.h +++ b/include/asm-sparc64/mmu_context.h @@ -76,6 +76,9 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str unsigned long ctx_valid, flags; int cpu; + if (unlikely(mm == &init_mm)) + return; + spin_lock_irqsave(&mm->context.lock, flags); ctx_valid = CTX_VALID(mm->context); if (!ctx_valid) diff --git a/include/asm-sparc64/power.h b/include/asm-sparc64/power.h new file mode 100644 index 000000000000..94495c1ac4f6 --- /dev/null +++ b/include/asm-sparc64/power.h @@ -0,0 +1,7 @@ +#ifndef _SPARC64_POWER_H +#define _SPARC64_POWER_H + +extern void wake_up_powerd(void); +extern int start_powerd(void); + +#endif /* !(_SPARC64_POWER_H) */ diff --git a/include/asm-sparc64/smp.h b/include/asm-sparc64/smp.h index 4fb8c4bfb848..e8a96a31761b 100644 --- a/include/asm-sparc64/smp.h +++ b/include/asm-sparc64/smp.h @@ -29,9 +29,6 @@ #include <asm/bitops.h> #include <asm/atomic.h> -extern cpumask_t phys_cpu_present_map; -#define cpu_possible_map phys_cpu_present_map - extern cpumask_t cpu_sibling_map[NR_CPUS]; extern cpumask_t cpu_core_map[NR_CPUS]; extern int sparc64_multi_core; @@ -44,7 +41,12 @@ extern int hard_smp_processor_id(void); #define raw_smp_processor_id() (current_thread_info()->cpu) extern void smp_fill_in_sib_core_maps(void); -extern unsigned char boot_cpu_id; +extern void cpu_play_dead(void); + +#ifdef CONFIG_HOTPLUG_CPU +extern int __cpu_disable(void); +extern void __cpu_die(unsigned int cpu); +#endif #endif /* !(__ASSEMBLY__) */ @@ -52,7 +54,6 @@ extern unsigned char boot_cpu_id; #define hard_smp_processor_id() 0 #define smp_fill_in_sib_core_maps() do { } while (0) -#define boot_cpu_id (0) #endif /* !(CONFIG_SMP) */ diff --git a/include/asm-sparc64/vio.h b/include/asm-sparc64/vio.h new file mode 100644 index 000000000000..83c96422e9d6 --- /dev/null +++ b/include/asm-sparc64/vio.h @@ -0,0 +1,404 @@ +#ifndef _SPARC64_VIO_H +#define _SPARC64_VIO_H + +#include <linux/kernel.h> +#include <linux/device.h> +#include <linux/mod_devicetable.h> +#include <linux/timer.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/list.h> + +#include <asm/ldc.h> +#include <asm/mdesc.h> + +struct vio_msg_tag { + u8 type; +#define VIO_TYPE_CTRL 0x01 +#define VIO_TYPE_DATA 0x02 +#define VIO_TYPE_ERR 0x04 + + u8 stype; +#define VIO_SUBTYPE_INFO 0x01 +#define VIO_SUBTYPE_ACK 0x02 +#define VIO_SUBTYPE_NACK 0x04 + + u16 stype_env; +#define VIO_VER_INFO 0x0001 +#define VIO_ATTR_INFO 0x0002 +#define VIO_DRING_REG 0x0003 +#define VIO_DRING_UNREG 0x0004 +#define VIO_RDX 0x0005 +#define VIO_PKT_DATA 0x0040 +#define VIO_DESC_DATA 0x0041 +#define VIO_DRING_DATA 0x0042 +#define VNET_MCAST_INFO 0x0101 + + u32 sid; +}; + +struct vio_rdx { + struct vio_msg_tag tag; + u64 resv[6]; +}; + +struct vio_ver_info { + struct vio_msg_tag tag; + u16 major; + u16 minor; + u8 dev_class; +#define VDEV_NETWORK 0x01 +#define VDEV_NETWORK_SWITCH 0x02 +#define VDEV_DISK 0x03 +#define VDEV_DISK_SERVER 0x04 + + u8 resv1[3]; + u64 resv2[5]; +}; + +struct vio_dring_register { + struct vio_msg_tag tag; + u64 dring_ident; + u32 num_descr; + u32 descr_size; + u16 options; +#define VIO_TX_DRING 0x0001 +#define VIO_RX_DRING 0x0002 + u16 resv; + u32 num_cookies; + struct ldc_trans_cookie cookies[0]; +}; + +struct vio_dring_unregister { + struct vio_msg_tag tag; + u64 dring_ident; + u64 resv[5]; +}; + +/* Data transfer modes */ +#define VIO_PKT_MODE 0x01 /* Packet based transfer */ +#define VIO_DESC_MODE 0x02 /* In-band descriptors */ +#define VIO_DRING_MODE 0x03 /* Descriptor rings */ + +struct vio_dring_data { + struct vio_msg_tag tag; + u64 seq; + u64 dring_ident; + u32 start_idx; + u32 end_idx; + u8 state; +#define VIO_DRING_ACTIVE 0x01 +#define VIO_DRING_STOPPED 0x02 + + u8 __pad1; + u16 __pad2; + u32 __pad3; + u64 __par4[2]; +}; + +struct vio_dring_hdr { + u8 state; +#define VIO_DESC_FREE 0x01 +#define VIO_DESC_READY 0x02 +#define VIO_DESC_ACCEPTED 0x03 +#define VIO_DESC_DONE 0x04 + u8 ack; +#define VIO_ACK_ENABLE 0x01 +#define VIO_ACK_DISABLE 0x00 + + u16 __pad1; + u32 __pad2; +}; + +/* VIO disk specific structures and defines */ +struct vio_disk_attr_info { + struct vio_msg_tag tag; + u8 xfer_mode; + u8 vdisk_type; +#define VD_DISK_TYPE_SLICE 0x01 /* Slice in block device */ +#define VD_DISK_TYPE_DISK 0x02 /* Entire block device */ + u16 resv1; + u32 vdisk_block_size; + u64 operations; + u64 vdisk_size; + u64 max_xfer_size; + u64 resv2[2]; +}; + +struct vio_disk_desc { + struct vio_dring_hdr hdr; + u64 req_id; + u8 operation; +#define VD_OP_BREAD 0x01 /* Block read */ +#define VD_OP_BWRITE 0x02 /* Block write */ +#define VD_OP_FLUSH 0x03 /* Flush disk contents */ +#define VD_OP_GET_WCE 0x04 /* Get write-cache status */ +#define VD_OP_SET_WCE 0x05 /* Enable/disable write-cache */ +#define VD_OP_GET_VTOC 0x06 /* Get VTOC */ +#define VD_OP_SET_VTOC 0x07 /* Set VTOC */ +#define VD_OP_GET_DISKGEOM 0x08 /* Get disk geometry */ +#define VD_OP_SET_DISKGEOM 0x09 /* Set disk geometry */ +#define VD_OP_SCSICMD 0x0a /* SCSI control command */ +#define VD_OP_GET_DEVID 0x0b /* Get device ID */ +#define VD_OP_GET_EFI 0x0c /* Get EFI */ +#define VD_OP_SET_EFI 0x0d /* Set EFI */ + u8 slice; + u16 resv1; + u32 status; + u64 offset; + u64 size; + u32 ncookies; + u32 resv2; + struct ldc_trans_cookie cookies[0]; +}; + +#define VIO_DISK_VNAME_LEN 8 +#define VIO_DISK_ALABEL_LEN 128 +#define VIO_DISK_NUM_PART 8 + +struct vio_disk_vtoc { + u8 volume_name[VIO_DISK_VNAME_LEN]; + u16 sector_size; + u16 num_partitions; + u8 ascii_label[VIO_DISK_ALABEL_LEN]; + struct { + u16 id; + u16 perm_flags; + u32 resv; + u64 start_block; + u64 num_blocks; + } partitions[VIO_DISK_NUM_PART]; +}; + +struct vio_disk_geom { + u16 num_cyl; /* Num data cylinders */ + u16 alt_cyl; /* Num alternate cylinders */ + u16 beg_cyl; /* Cyl off of fixed head area */ + u16 num_hd; /* Num heads */ + u16 num_sec; /* Num sectors */ + u16 ifact; /* Interleave factor */ + u16 apc; /* Alts per cylinder (SCSI) */ + u16 rpm; /* Revolutions per minute */ + u16 phy_cyl; /* Num physical cylinders */ + u16 wr_skip; /* Num sects to skip, writes */ + u16 rd_skip; /* Num sects to skip, writes */ +}; + +struct vio_disk_devid { + u16 resv; + u16 type; + u32 len; + char id[0]; +}; + +struct vio_disk_efi { + u64 lba; + u64 len; + char data[0]; +}; + +/* VIO net specific structures and defines */ +struct vio_net_attr_info { + struct vio_msg_tag tag; + u8 xfer_mode; + u8 addr_type; +#define VNET_ADDR_ETHERMAC 0x01 + u16 ack_freq; + u32 resv1; + u64 addr; + u64 mtu; + u64 resv2[3]; +}; + +#define VNET_NUM_MCAST 7 + +struct vio_net_mcast_info { + struct vio_msg_tag tag; + u8 set; + u8 count; + u8 mcast_addr[VNET_NUM_MCAST * 6]; + u32 resv; +}; + +struct vio_net_desc { + struct vio_dring_hdr hdr; + u32 size; + u32 ncookies; + struct ldc_trans_cookie cookies[0]; +}; + +#define VIO_MAX_RING_COOKIES 24 + +struct vio_dring_state { + u64 ident; + void *base; + u64 snd_nxt; + u64 rcv_nxt; + u32 entry_size; + u32 num_entries; + u32 prod; + u32 cons; + u32 pending; + int ncookies; + struct ldc_trans_cookie cookies[VIO_MAX_RING_COOKIES]; +}; + +static inline void *vio_dring_cur(struct vio_dring_state *dr) +{ + return dr->base + (dr->entry_size * dr->prod); +} + +static inline void *vio_dring_entry(struct vio_dring_state *dr, + unsigned int index) +{ + return dr->base + (dr->entry_size * index); +} + +static inline u32 vio_dring_avail(struct vio_dring_state *dr, + unsigned int ring_size) +{ + /* Ensure build-time power-of-2. */ + BUILD_BUG_ON(ring_size & (ring_size - 1)); + + return (dr->pending - + ((dr->prod - dr->cons) & (ring_size - 1))); +} + +#define VIO_MAX_TYPE_LEN 64 +#define VIO_MAX_COMPAT_LEN 64 + +struct vio_dev { + u64 mp; + struct device_node *dp; + + char type[VIO_MAX_TYPE_LEN]; + char compat[VIO_MAX_COMPAT_LEN]; + int compat_len; + + unsigned long channel_id; + + unsigned int tx_irq; + unsigned int rx_irq; + + struct device dev; +}; + +struct vio_driver { + struct list_head node; + const struct vio_device_id *id_table; + int (*probe)(struct vio_dev *dev, const struct vio_device_id *id); + int (*remove)(struct vio_dev *dev); + void (*shutdown)(struct vio_dev *dev); + unsigned long driver_data; + struct device_driver driver; +}; + +struct vio_version { + u16 major; + u16 minor; +}; + +struct vio_driver_state; +struct vio_driver_ops { + int (*send_attr)(struct vio_driver_state *vio); + int (*handle_attr)(struct vio_driver_state *vio, void *pkt); + void (*handshake_complete)(struct vio_driver_state *vio); +}; + +struct vio_completion { + struct completion com; + int err; + int waiting_for; +}; + +struct vio_driver_state { + /* Protects VIO handshake and, optionally, driver private state. */ + spinlock_t lock; + + struct ldc_channel *lp; + + u32 _peer_sid; + u32 _local_sid; + struct vio_dring_state drings[2]; +#define VIO_DRIVER_TX_RING 0 +#define VIO_DRIVER_RX_RING 1 + + u8 hs_state; +#define VIO_HS_INVALID 0x00 +#define VIO_HS_GOTVERS 0x01 +#define VIO_HS_GOT_ATTR 0x04 +#define VIO_HS_SENT_DREG 0x08 +#define VIO_HS_SENT_RDX 0x10 +#define VIO_HS_GOT_RDX_ACK 0x20 +#define VIO_HS_GOT_RDX 0x40 +#define VIO_HS_SENT_RDX_ACK 0x80 +#define VIO_HS_COMPLETE (VIO_HS_GOT_RDX_ACK | VIO_HS_SENT_RDX_ACK) + + u8 dev_class; + + u8 dr_state; +#define VIO_DR_STATE_TXREG 0x01 +#define VIO_DR_STATE_RXREG 0x02 +#define VIO_DR_STATE_TXREQ 0x10 +#define VIO_DR_STATE_RXREQ 0x20 + + u8 debug; +#define VIO_DEBUG_HS 0x01 +#define VIO_DEBUG_DATA 0x02 + + void *desc_buf; + unsigned int desc_buf_len; + + struct vio_completion *cmp; + + struct vio_dev *vdev; + + struct timer_list timer; + + struct vio_version ver; + + struct vio_version *ver_table; + int ver_table_entries; + + char *name; + + struct vio_driver_ops *ops; +}; + +#define viodbg(TYPE, f, a...) \ +do { if (vio->debug & VIO_DEBUG_##TYPE) \ + printk(KERN_INFO "vio: ID[%lu] " f, \ + vio->vdev->channel_id, ## a); \ +} while (0) + +extern int vio_register_driver(struct vio_driver *drv); +extern void vio_unregister_driver(struct vio_driver *drv); + +static inline struct vio_driver *to_vio_driver(struct device_driver *drv) +{ + return container_of(drv, struct vio_driver, driver); +} + +static inline struct vio_dev *to_vio_dev(struct device *dev) +{ + return container_of(dev, struct vio_dev, dev); +} + +extern int vio_ldc_send(struct vio_driver_state *vio, void *data, int len); +extern void vio_link_state_change(struct vio_driver_state *vio, int event); +extern void vio_conn_reset(struct vio_driver_state *vio); +extern int vio_control_pkt_engine(struct vio_driver_state *vio, void *pkt); +extern int vio_validate_sid(struct vio_driver_state *vio, + struct vio_msg_tag *tp); +extern u32 vio_send_sid(struct vio_driver_state *vio); +extern int vio_ldc_alloc(struct vio_driver_state *vio, + struct ldc_channel_config *base_cfg, void *event_arg); +extern void vio_ldc_free(struct vio_driver_state *vio); +extern int vio_driver_init(struct vio_driver_state *vio, struct vio_dev *vdev, + u8 dev_class, struct vio_version *ver_table, + int ver_table_size, struct vio_driver_ops *ops, + char *name); + +extern void vio_port_up(struct vio_driver_state *vio); + +#endif /* _SPARC64_VIO_H */ |