From 1e66df3ee301209f4a38df097d7cc5cb9b367a3f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:02 -0700 Subject: add kstrndup Add a kstrndup function, modelled on strndup. Like strndup this returns a string copied into its own allocated memory, but it copies no more than the specified number of bytes from the source. Remove private strndup() from irda code. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright Cc: Andrew Morton Cc: Randy Dunlap Cc: YOSHIFUJI Hideaki Cc: Akinobu Mita Cc: Arnaldo Carvalho de Melo Cc: Al Viro Cc: Panagiotis Issaris Cc: Rene Scharfe --- include/linux/string.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 7f2eb6a477f9..ee5e9ccc4aae 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -105,6 +105,7 @@ extern void * memchr(const void *,int,__kernel_size_t); #endif extern char *kstrdup(const char *s, gfp_t gfp); +extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp); #ifdef __cplusplus -- cgit v1.2.3 From d84d1cc7647c7e4f77d517e2d87b4a106a0420d9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:02 -0700 Subject: add argv_split() argv_split() is a helper function which takes a string, splits it at whitespace, and returns a NULL-terminated argv vector. This is deliberately simple - it does no quote processing of any kind. [ Seems to me that this is something which is already being done in the kernel, but I couldn't find any other implementations, either to steal or replace. Keep an eye out. ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright Cc: Andrew Morton Cc: Randy Dunlap --- include/linux/string.h | 3 ++ lib/Makefile | 2 +- lib/argv_split.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 lib/argv_split.c (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index ee5e9ccc4aae..836062b7582a 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -108,6 +108,9 @@ extern char *kstrdup(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp); +extern char **argv_split(gfp_t gfp, const char *str, int *argcp); +extern void argv_free(char **argv); + #ifdef __cplusplus } #endif diff --git a/lib/Makefile b/lib/Makefile index da68b2ca0606..614966387402 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -5,7 +5,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o \ idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \ - sha1.o irq_regs.o reciprocal_div.o + sha1.o irq_regs.o reciprocal_div.o argv_split.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/argv_split.c b/lib/argv_split.c new file mode 100644 index 000000000000..4096ed42f490 --- /dev/null +++ b/lib/argv_split.c @@ -0,0 +1,105 @@ +/* + * Helper function for splitting a string into an argv-like array. 
+ */ + +#include +#include +#include + +static const char *skip_sep(const char *cp) +{ + while (*cp && isspace(*cp)) + cp++; + + return cp; +} + +static const char *skip_arg(const char *cp) +{ + while (*cp && !isspace(*cp)) + cp++; + + return cp; +} + +static int count_argc(const char *str) +{ + int count = 0; + + while (*str) { + str = skip_sep(str); + if (*str) { + count++; + str = skip_arg(str); + } + } + + return count; +} + +/** + * argv_free - free an argv + * @argv - the argument vector to be freed + * + * Frees an argv and the strings it points to. + */ +void argv_free(char **argv) +{ + char **p; + for (p = argv; *p; p++) + kfree(*p); + + kfree(argv); +} +EXPORT_SYMBOL(argv_free); + +/** + * argv_split - split a string at whitespace, returning an argv + * @gfp: the GFP mask used to allocate memory + * @str: the string to be split + * @argcp: returned argument count + * + * Returns an array of pointers to strings which are split out from + * @str. This is performed by strictly splitting on white-space; no + * quote processing is performed. Multiple whitespace characters are + * considered to be a single argument separator. The returned array + * is always NULL-terminated. Returns NULL on memory allocation + * failure. + */ +char **argv_split(gfp_t gfp, const char *str, int *argcp) +{ + int argc = count_argc(str); + char **argv = kzalloc(sizeof(*argv) * (argc+1), gfp); + char **argvp; + + if (argv == NULL) + goto out; + + *argcp = argc; + argvp = argv; + + while (*str) { + str = skip_sep(str); + + if (*str) { + const char *p = str; + char *t; + + str = skip_arg(str); + + t = kstrndup(p, str-p, gfp); + if (t == NULL) + goto fail; + *argvp++ = t; + } + } + *argvp = NULL; + + out: + return argv; + + fail: + argv_free(argv); + return NULL; +} +EXPORT_SYMBOL(argv_split); -- cgit v1.2.3 From 0ab4dc92278a0f3816e486d6350c6652a72e06c8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:02 -0700 Subject: usermodehelper: split setup from execution Rather than having hundreds of variations of call_usermodehelper for various pieces of usermode state which could be set up, split the info allocation and initialization from the actual process execution. This means the general pattern becomes: info = call_usermodehelper_setup(path, argv, envp); /* basic state */ call_usermodehelper_(info, stuff...); /* extra state */ call_usermodehelper_exec(info, wait); /* run process and free info */ This patch introduces wrappers for all the existing calling styles for call_usermodehelper_*, but folds their implementations into one. Signed-off-by: Jeremy Fitzhardinge Cc: Andi Kleen Cc: Rusty Russell Cc: David Howells Cc: Bj?rn Steinbrink Cc: Randy Dunlap --- include/linux/kmod.h | 44 +++++++++++- kernel/kmod.c | 191 ++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 176 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 10f505c8431d..c4cbe59d9c67 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -36,13 +36,51 @@ static inline int request_module(const char * name, ...) { return -ENOSYS; } #define try_then_request_module(x, mod...) 
((x) ?: (request_module(mod), (x))) struct key; -extern int call_usermodehelper_keys(char *path, char *argv[], char *envp[], - struct key *session_keyring, int wait); +struct file; +struct subprocess_info; + +/* Allocate a subprocess_info structure */ +struct subprocess_info *call_usermodehelper_setup(char *path, + char **argv, char **envp); + +/* Set various pieces of state into the subprocess_info structure */ +void call_usermodehelper_setkeys(struct subprocess_info *info, + struct key *session_keyring); +int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, + struct file **filp); +void call_usermodehelper_setcleanup(struct subprocess_info *info, + void (*cleanup)(char **argv, char **envp)); + +/* Actually execute the sub-process */ +int call_usermodehelper_exec(struct subprocess_info *info, int wait); + +/* Free the subprocess_info. This is only needed if you're not going + to call call_usermodehelper_exec */ +void call_usermodehelper_freeinfo(struct subprocess_info *info); static inline int call_usermodehelper(char *path, char **argv, char **envp, int wait) { - return call_usermodehelper_keys(path, argv, envp, NULL, wait); + struct subprocess_info *info; + + info = call_usermodehelper_setup(path, argv, envp); + if (info == NULL) + return -ENOMEM; + return call_usermodehelper_exec(info, wait); +} + +static inline int +call_usermodehelper_keys(char *path, char **argv, char **envp, + struct key *session_keyring, int wait) +{ + struct subprocess_info *info; + + info = call_usermodehelper_setup(path, argv, envp); + if (info == NULL) + return -ENOMEM; + + call_usermodehelper_setkeys(info, session_keyring); + return call_usermodehelper_exec(info, wait); } extern void usermodehelper_init(void); diff --git a/kernel/kmod.c b/kernel/kmod.c index 4d32eb077179..d2dce71115d8 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -122,6 +122,7 @@ struct subprocess_info { int wait; int retval; struct file *stdin; + void (*cleanup)(char **argv, char **envp); }; /* @@ -180,6 +181,14 @@ static int ____call_usermodehelper(void *data) do_exit(0); } +void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info->argv, info->envp); + kfree(info); +} +EXPORT_SYMBOL(call_usermodehelper_freeinfo); + /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -217,7 +226,7 @@ static int wait_for_helper(void *data) } if (sub_info->wait < 0) - kfree(sub_info); + call_usermodehelper_freeinfo(sub_info); else complete(sub_info->complete); return 0; @@ -252,11 +261,94 @@ static void __call_usermodehelper(struct work_struct *work) } /** - * call_usermodehelper_keys - start a usermode application - * @path: pathname for the application - * @argv: null-terminated argument list - * @envp: null-terminated environment list - * @session_keyring: session keyring for process (NULL for an empty keyring) + * call_usermodehelper_setup - prepare to call a usermode helper + * @path - path to usermode executable + * @argv - arg vector for process + * @envp - environment for process + * + * Returns either NULL on allocation failure, or a subprocess_info + * structure. This should be passed to call_usermodehelper_exec to + * exec the process and free the structure. 
+ */
+struct subprocess_info *call_usermodehelper_setup(char *path,
+ char **argv, char **envp)
+{
+ struct subprocess_info *sub_info;
+ sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
+ if (!sub_info)
+ goto out;
+
+ INIT_WORK(&sub_info->work, __call_usermodehelper);
+ sub_info->path = path;
+ sub_info->argv = argv;
+ sub_info->envp = envp;
+
+ out:
+ return sub_info;
+}
+EXPORT_SYMBOL(call_usermodehelper_setup);
+
+/**
+ * call_usermodehelper_setkeys - set the session keys for usermode helper
+ * @info: a subprocess_info returned by call_usermodehelper_setup
+ * @session_keyring: the session keyring for the process
+ */
+void call_usermodehelper_setkeys(struct subprocess_info *info,
+ struct key *session_keyring)
+{
+ info->ring = session_keyring;
+}
+EXPORT_SYMBOL(call_usermodehelper_setkeys);
+
+/**
+ * call_usermodehelper_setcleanup - set a cleanup function
+ * @info: a subprocess_info returned by call_usermodehelper_setup
+ * @cleanup: a cleanup function
+ *
+ * The cleanup function is called just before the subprocess_info is
+ * about to be freed. This can be used for freeing the argv and envp.
+ * The function must be runnable in either a process context or the
+ * context in which call_usermodehelper_exec is called.
+ */
+void call_usermodehelper_setcleanup(struct subprocess_info *info,
+ void (*cleanup)(char **argv, char **envp))
+{
+ info->cleanup = cleanup;
+}
+EXPORT_SYMBOL(call_usermodehelper_setcleanup);
+
+/**
+ * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
+ * @sub_info: a subprocess_info returned by call_usermodehelper_setup
+ * @filp: set to the write-end of a pipe
+ *
+ * This constructs a pipe, and sets the read end to be the stdin of the
+ * subprocess, and returns the write-end in *@filp.
+ */
+int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
+ struct file **filp)
+{
+ struct file *f;
+
+ f = create_write_pipe();
+ if (IS_ERR(f))
+ return PTR_ERR(f);
+ *filp = f;
+
+ f = create_read_pipe(f);
+ if (IS_ERR(f)) {
+ free_write_pipe(*filp);
+ return PTR_ERR(f);
+ }
+ sub_info->stdin = f;
+
+ return 0;
+}
+EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
+
+/**
+ * call_usermodehelper_exec - start a usermode application
+ * @sub_info: information about the subprocess
 * @wait: wait for the application to finish and return status.
 * when -1 don't wait at all, but you get no useful error back when
 * the program couldn't be exec'ed. This makes it safe to call
@@ -265,33 +357,24 @@ static void __call_usermodehelper(struct work_struct *work)
 * Runs a user-space application. The application is started
 * asynchronously if wait is not set, and runs as a child of keventd.
 * (ie. it runs with full root capabilities).
- *
- * Must be called from process context. Returns a negative error code
- * if program was not execed successfully, or 0.
*/ -int call_usermodehelper_keys(char *path, char **argv, char **envp, - struct key *session_keyring, int wait) +int call_usermodehelper_exec(struct subprocess_info *sub_info, + int wait) { DECLARE_COMPLETION_ONSTACK(done); - struct subprocess_info *sub_info; int retval; - if (!khelper_wq) - return -EBUSY; - - if (path[0] == '\0') - return 0; + if (sub_info->path[0] == '\0') { + retval = 0; + goto out; + } - sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); - if (!sub_info) - return -ENOMEM; + if (!khelper_wq) { + retval = -EBUSY; + goto out; + } - INIT_WORK(&sub_info->work, __call_usermodehelper); sub_info->complete = &done; - sub_info->path = path; - sub_info->argv = argv; - sub_info->envp = envp; - sub_info->ring = session_keyring; sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); @@ -299,47 +382,43 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, return 0; wait_for_completion(&done); retval = sub_info->retval; - kfree(sub_info); + + out: + call_usermodehelper_freeinfo(sub_info); return retval; } -EXPORT_SYMBOL(call_usermodehelper_keys); +EXPORT_SYMBOL(call_usermodehelper_exec); +/** + * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @filp: set to the write-end of a pipe + * + * This is a simple wrapper which executes a usermode-helper function + * with a pipe as stdin. It is implemented entirely in terms of + * lower-level call_usermodehelper_* functions. + */ int call_usermodehelper_pipe(char *path, char **argv, char **envp, struct file **filp) { - DECLARE_COMPLETION(done); - struct subprocess_info sub_info = { - .work = __WORK_INITIALIZER(sub_info.work, - __call_usermodehelper), - .complete = &done, - .path = path, - .argv = argv, - .envp = envp, - .retval = 0, - }; - struct file *f; - - if (!khelper_wq) - return -EBUSY; + struct subprocess_info *sub_info; + int ret; - if (path[0] == '\0') - return 0; + sub_info = call_usermodehelper_setup(path, argv, envp); + if (sub_info == NULL) + return -ENOMEM; - f = create_write_pipe(); - if (IS_ERR(f)) - return PTR_ERR(f); - *filp = f; + ret = call_usermodehelper_stdinpipe(sub_info, filp); + if (ret < 0) + goto out; - f = create_read_pipe(f); - if (IS_ERR(f)) { - free_write_pipe(*filp); - return PTR_ERR(f); - } - sub_info.stdin = f; + return call_usermodehelper_exec(sub_info, 1); - queue_work(khelper_wq, &sub_info.work); - wait_for_completion(&done); - return sub_info.retval; + out: + call_usermodehelper_freeinfo(sub_info); + return ret; } EXPORT_SYMBOL(call_usermodehelper_pipe); -- cgit v1.2.3 From 10a0a8d4e3f6bf2d077f94344441909abe670f5a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:02 -0700 Subject: Add common orderly_poweroff() Various pieces of code around the kernel want to be able to trigger an orderly poweroff. This pulls them together into a single implementation. By default the poweroff command is /sbin/poweroff, but it can be set via sysctl: kernel/poweroff_cmd. This is split at whitespace, so it can include command-line arguments. This patch replaces four other instances of invoking either "poweroff" or "shutdown -h now": two sbus drivers, and acpi thermal management. sparc64 has its own "powerd"; still need to determine whether it should be replaced by orderly_poweroff(). 
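[ For illustration only: a minimal sketch of how a driver would use the
  new helper. The driver and function names below are hypothetical; only
  orderly_poweroff() itself comes from this patch.

	#include <linux/kernel.h>
	#include <linux/reboot.h>

	/* On a fatal condition, ask userspace to shut down cleanly;
	 * force an immediate poweroff if the helper cannot be run. */
	static void mydrv_critical_shutdown(void)
	{
		printk(KERN_CRIT "mydrv: critical condition, shutting down\n");
		orderly_poweroff(true);
	}

  Since the command string is split at whitespace before being run, an
  administrator can point kernel/poweroff_cmd (/proc/sys/kernel/poweroff_cmd)
  at something like "/sbin/shutdown -h now" and the arguments are passed
  through. ]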
Signed-off-by: Jeremy Fitzhardinge Acked-by: Len Brown Signed-off-by: Chris Wright Cc: Andrew Morton Cc: Randy Dunlap Cc: Andi Kleen Cc: Al Viro Cc: Arnd Bergmann Cc: David S. Miller --- drivers/acpi/thermal.c | 24 ++--------------- drivers/sbus/char/bbc_envctrl.c | 5 ++-- drivers/sbus/char/envctrl.c | 7 ++--- include/linux/reboot.h | 5 ++++ kernel/sys.c | 58 +++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 10 +++++++ 6 files changed, 79 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 88a6fc7fd271..58f1338981bc 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -59,7 +60,6 @@ #define ACPI_THERMAL_NOTIFY_CRITICAL 0xF0 #define ACPI_THERMAL_NOTIFY_HOT 0xF1 #define ACPI_THERMAL_MODE_ACTIVE 0x00 -#define ACPI_THERMAL_PATH_POWEROFF "/sbin/poweroff" #define ACPI_THERMAL_MAX_ACTIVE 10 #define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65 @@ -419,26 +419,6 @@ static int acpi_thermal_get_devices(struct acpi_thermal *tz) return 0; } -static int acpi_thermal_call_usermode(char *path) -{ - char *argv[2] = { NULL, NULL }; - char *envp[3] = { NULL, NULL, NULL }; - - - if (!path) - return -EINVAL; - - argv[0] = path; - - /* minimal command environment */ - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - - call_usermodehelper(argv[0], argv, envp, 0); - - return 0; -} - static int acpi_thermal_critical(struct acpi_thermal *tz) { if (!tz || !tz->trips.critical.flags.valid) @@ -456,7 +436,7 @@ static int acpi_thermal_critical(struct acpi_thermal *tz) acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL, tz->trips.critical.flags.enabled); - acpi_thermal_call_usermode(ACPI_THERMAL_PATH_POWEROFF); + orderly_poweroff(true); return 0; } diff --git a/drivers/sbus/char/bbc_envctrl.c b/drivers/sbus/char/bbc_envctrl.c index a54e4140683a..e821a155b658 100644 --- a/drivers/sbus/char/bbc_envctrl.c +++ b/drivers/sbus/char/bbc_envctrl.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -170,8 +171,6 @@ static void get_current_temps(struct bbc_cpu_temperature *tp) static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp) { static int shutting_down = 0; - static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; - char *argv[] = { "/sbin/shutdown", "-h", "now", NULL }; char *type = "???"; s8 val = -1; @@ -195,7 +194,7 @@ static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp) printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n"); shutting_down = 1; - if (call_usermodehelper("/sbin/shutdown", argv, envp, 0) < 0) + if (orderly_poweroff(true) < 0) printk(KERN_CRIT "envctrl: shutdown execution failed\n"); } diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c index 8328acab47fd..dadabef116b6 100644 --- a/drivers/sbus/char/envctrl.c +++ b/drivers/sbus/char/envctrl.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -966,10 +967,6 @@ static struct i2c_child_t *envctrl_get_i2c_child(unsigned char mon_type) static void envctrl_do_shutdown(void) { static int inprog = 0; - static char *envp[] = { - "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; - char *argv[] = { - "/sbin/shutdown", "-h", "now", NULL }; int ret; if (inprog != 0) @@ -977,7 +974,7 @@ static void envctrl_do_shutdown(void) inprog = 1; printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n"); - ret = 
call_usermodehelper("/sbin/shutdown", argv, envp, 0); + ret = orderly_poweroff(true); if (ret < 0) { printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n"); inprog = 0; /* unlikely to succeed, but we could try again */ diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 1dd1c707311f..85ea63f462af 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -67,6 +67,11 @@ extern void kernel_power_off(void); void ctrl_alt_del(void); +#define POWEROFF_CMD_PATH_LEN 256 +extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN]; + +extern int orderly_poweroff(bool force); + /* * Emergency restart, callable from an interrupt handler. */ diff --git a/kernel/sys.c b/kernel/sys.c index 4d141ae3e802..aeded9ad66ce 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2286,3 +2286,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, } return err ? -EFAULT : 0; } + +char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; + +static void argv_cleanup(char **argv, char **envp) +{ + argv_free(argv); +} + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. + */ +int orderly_poweroff(bool force) +{ + int argc; + char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); + static char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL + }; + int ret = -ENOMEM; + struct subprocess_info *info; + + if (argv == NULL) { + printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", + __func__, poweroff_cmd); + goto out; + } + + info = call_usermodehelper_setup(argv[0], argv, envp); + if (info == NULL) { + argv_free(argv); + goto out; + } + + call_usermodehelper_setcleanup(info, argv_cleanup); + + ret = call_usermodehelper_exec(info, -1); + + out: + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + + /* I guess this should try to kick off some daemon to + sync and poweroff asap. Or not even bother syncing + if we're doing an emergency shutdown? */ + emergency_sync(); + kernel_power_off(); + } + + return ret; +} +EXPORT_SYMBOL_GPL(orderly_poweroff); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7063ebc6db05..44a1d699aad7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -705,6 +706,15 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, { .ctl_name = 0 } }; -- cgit v1.2.3 From 86313c488a6848b7ec2ba04e74f25f79dd32a0b7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:03 -0700 Subject: usermodehelper: Tidy up waiting Rather than using a tri-state integer for the wait flag in call_usermodehelper_exec, define a proper enum, and use that. I've preserved the integer values so that any callers I've missed should still work OK. 
Signed-off-by: Jeremy Fitzhardinge Cc: James Bottomley Cc: Randy Dunlap Cc: Christoph Hellwig Cc: Andi Kleen Cc: Paul Mackerras Cc: Johannes Berg Cc: Ralf Baechle Cc: Bjorn Helgaas Cc: Joel Becker Cc: Tony Luck Cc: Kay Sievers Cc: Srivatsa Vaddagiri Cc: Oleg Nesterov Cc: David Howells --- arch/i386/mach-voyager/voyager_thread.c | 2 +- arch/x86_64/kernel/mce.c | 2 +- drivers/macintosh/therm_pm72.c | 3 ++- drivers/macintosh/windfarm_core.c | 3 ++- drivers/net/hamradio/baycom_epp.c | 2 +- drivers/pnp/pnpbios/core.c | 2 +- fs/ocfs2/heartbeat.c | 2 +- include/linux/kmod.h | 12 +++++++++--- kernel/cpuset.c | 2 +- kernel/kmod.c | 27 ++++++++++++++++----------- kernel/sys.c | 2 +- lib/kobject_uevent.c | 2 +- net/bridge/br_stp_if.c | 2 +- security/keys/request_key.c | 3 ++- 14 files changed, 40 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c index b4b24e0e45e1..f9d595338159 100644 --- a/arch/i386/mach-voyager/voyager_thread.c +++ b/arch/i386/mach-voyager/voyager_thread.c @@ -52,7 +52,7 @@ execute(const char *string) NULL, }; - if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) { + if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string, ret); } diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index aa1d15991794..f3fb8174559e 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -174,7 +174,7 @@ static void do_mce_trigger(void) if (events != atomic_read(&mce_logged) && trigger[0]) { /* Small race window, but should be harmless. */ atomic_set(&mce_logged, events); - call_usermodehelper(trigger, trigger_argv, NULL, -1); + call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); } } diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c index dbb22403979f..3d90fc002097 100644 --- a/drivers/macintosh/therm_pm72.c +++ b/drivers/macintosh/therm_pm72.c @@ -1770,7 +1770,8 @@ static int call_critical_overtemp(void) "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; - return call_usermodehelper(critical_overtemp_path, argv, envp, 0); + return call_usermodehelper(critical_overtemp_path, + argv, envp, UMH_WAIT_EXEC); } diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c index e18d265d5d33..516d943227e2 100644 --- a/drivers/macintosh/windfarm_core.c +++ b/drivers/macintosh/windfarm_core.c @@ -80,7 +80,8 @@ int wf_critical_overtemp(void) "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; - return call_usermodehelper(critical_overtemp_path, argv, envp, 0); + return call_usermodehelper(critical_overtemp_path, + argv, envp, UMH_WAIT_EXEC); } EXPORT_SYMBOL_GPL(wf_critical_overtemp); diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 84aa2117c0ee..355c6cf3d112 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -320,7 +320,7 @@ static int eppconfig(struct baycom_state *bc) sprintf(portarg, "%ld", bc->pdev->port->base); printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg); - return call_usermodehelper(eppconfig_path, argv, envp, 1); + return call_usermodehelper(eppconfig_path, argv, envp, UMH_WAIT_PROC); } /* ---------------------------------------------------------------------- */ diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index 03baf1c64a2e..ed112ee16012 100644 --- a/drivers/pnp/pnpbios/core.c +++ 
b/drivers/pnp/pnpbios/core.c @@ -147,7 +147,7 @@ static int pnp_dock_event(int dock, struct pnp_docking_station_info *info) info->location_id, info->serial, info->capabilities); envp[i] = NULL; - value = call_usermodehelper (argv [0], argv, envp, 0); + value = call_usermodehelper (argv [0], argv, envp, UMH_WAIT_EXEC); kfree (buf); kfree (envp); return 0; diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 352eb4a13f98..c4c36171240d 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -209,7 +209,7 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb) envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; - ret = call_usermodehelper(argv[0], argv, envp, 1); + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (ret < 0) mlog_errno(ret); } diff --git a/include/linux/kmod.h b/include/linux/kmod.h index c4cbe59d9c67..5dc13848891b 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -51,15 +51,21 @@ int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, void call_usermodehelper_setcleanup(struct subprocess_info *info, void (*cleanup)(char **argv, char **envp)); +enum umh_wait { + UMH_NO_WAIT = -1, /* don't wait at all */ + UMH_WAIT_EXEC = 0, /* wait for the exec, but not the process */ + UMH_WAIT_PROC = 1, /* wait for the process to complete */ +}; + /* Actually execute the sub-process */ -int call_usermodehelper_exec(struct subprocess_info *info, int wait); +int call_usermodehelper_exec(struct subprocess_info *info, enum umh_wait wait); /* Free the subprocess_info. This is only needed if you're not going to call call_usermodehelper_exec */ void call_usermodehelper_freeinfo(struct subprocess_info *info); static inline int -call_usermodehelper(char *path, char **argv, char **envp, int wait) +call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait) { struct subprocess_info *info; @@ -71,7 +77,7 @@ call_usermodehelper(char *path, char **argv, char **envp, int wait) static inline int call_usermodehelper_keys(char *path, char **argv, char **envp, - struct key *session_keyring, int wait) + struct key *session_keyring, enum umh_wait wait) { struct subprocess_info *info; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b4796d850140..57e6448b171e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; - call_usermodehelper(argv[0], argv, envp, 0); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); kfree(pathbuf); } diff --git a/kernel/kmod.c b/kernel/kmod.c index d2dce71115d8..78d365c524ed 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -119,7 +119,7 @@ struct subprocess_info { char **argv; char **envp; struct key *ring; - int wait; + enum umh_wait wait; int retval; struct file *stdin; void (*cleanup)(char **argv, char **envp); @@ -225,7 +225,7 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - if (sub_info->wait < 0) + if (sub_info->wait == UMH_NO_WAIT) call_usermodehelper_freeinfo(sub_info); else complete(sub_info->complete); @@ -238,26 +238,31 @@ static void __call_usermodehelper(struct work_struct *work) struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); pid_t pid; - int wait = sub_info->wait; + enum umh_wait wait = sub_info->wait; /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. 
*/ - if (wait) + if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); else pid = kernel_thread(____call_usermodehelper, sub_info, CLONE_VFORK | SIGCHLD); - if (wait < 0) - return; + switch (wait) { + case UMH_NO_WAIT: + break; - if (pid < 0) { + case UMH_WAIT_PROC: + if (pid > 0) + break; sub_info->retval = pid; + /* FALLTHROUGH */ + + case UMH_WAIT_EXEC: complete(sub_info->complete); - } else if (!wait) - complete(sub_info->complete); + } } /** @@ -359,7 +364,7 @@ EXPORT_SYMBOL(call_usermodehelper_stdinpipe); * (ie. it runs with full root capabilities). */ int call_usermodehelper_exec(struct subprocess_info *sub_info, - int wait) + enum umh_wait wait) { DECLARE_COMPLETION_ONSTACK(done); int retval; @@ -378,7 +383,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); - if (wait < 0) /* task has freed sub_info */ + if (wait == UMH_NO_WAIT) /* task has freed sub_info */ return 0; wait_for_completion(&done); retval = sub_info->retval; diff --git a/kernel/sys.c b/kernel/sys.c index aeded9ad66ce..18987c7f6add 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2327,7 +2327,7 @@ int orderly_poweroff(bool force) call_usermodehelper_setcleanup(info, argv_cleanup); - ret = call_usermodehelper_exec(info, -1); + ret = call_usermodehelper_exec(info, UMH_NO_WAIT); out: if (ret && force) { diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 12e311dc664c..bd5ecbbafab1 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -208,7 +208,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, argv [0] = uevent_helper; argv [1] = (char *)subsystem; argv [2] = NULL; - call_usermodehelper (argv[0], argv, envp, 0); + call_usermodehelper (argv[0], argv, envp, UMH_WAIT_EXEC); } exit: diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index a786e7863200..1ea2f86f7683 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -125,7 +125,7 @@ static void br_stp_start(struct net_bridge *br) char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL }; char *envp[] = { NULL }; - r = call_usermodehelper(BR_STP_PROG, argv, envp, 1); + r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); if (r == 0) { br->stp_enabled = BR_USER_STP; printk(KERN_INFO "%s: userspace STP started\n", br->dev->name); diff --git a/security/keys/request_key.c b/security/keys/request_key.c index f573ac189a0a..557500110a13 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -108,7 +108,8 @@ static int call_sbin_request_key(struct key *key, argv[i] = NULL; /* do it */ - ret = call_usermodehelper_keys(argv[0], argv, envp, keyring, 1); + ret = call_usermodehelper_keys(argv[0], argv, envp, keyring, + UMH_WAIT_PROC); error_link: key_put(keyring); -- cgit v1.2.3 From 810bab448e563ffd1718d78e9a3756806b626acc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:03 -0700 Subject: use elfnote.h to generate vsyscall notes. Use existing elfnote.h to generate vsyscall notes, rather than doing it locally. Changes elfnote.h a bit to suit, since this is the first asm user, and it wasn't quite right. Signed-off-by: Jeremy Fitzhardinge Cc: "Eric W. 
Biederman" Cc: Roland McGrath Cc: Andrew Morton --- arch/i386/kernel/vsyscall-note.S | 23 ++++++----------------- include/linux/elfnote.h | 22 +++++++++++++++------- 2 files changed, 21 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S index d4b5be4f3d5f..52e0cbbac70c 100644 --- a/arch/i386/kernel/vsyscall-note.S +++ b/arch/i386/kernel/vsyscall-note.S @@ -3,23 +3,12 @@ * Here we can supply some information useful to userland. */ -#include #include +#include -#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \ - .section name, flags; \ - .balign 4; \ - .long 1f - 0f; /* name length */ \ - .long 3f - 2f; /* data length */ \ - .long type; /* note type */ \ -0: .asciz vendor; /* vendor name */ \ -1: .balign 4; \ -2: - -#define ASM_ELF_NOTE_END \ -3: .balign 4; /* pad out section */ \ - .previous - - ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0) +/* Ideally this would use UTS_NAME, but using a quoted string here + doesn't work. Remember to change this when changing the + kernel's name. */ +ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE - ASM_ELF_NOTE_END +ELFNOTE_END diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h index 9a1e0674e56c..e831759b2fb5 100644 --- a/include/linux/elfnote.h +++ b/include/linux/elfnote.h @@ -38,17 +38,25 @@ * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two") * ELFNOTE(XYZCo, 12, .long, 0xdeadbeef) */ -#define ELFNOTE(name, type, desctype, descdata) \ -.pushsection .note.name, "",@note ; \ - .align 4 ; \ +#define ELFNOTE_START(name, type, flags) \ +.pushsection .note.name, flags,@note ; \ + .balign 4 ; \ .long 2f - 1f /* namesz */ ; \ - .long 4f - 3f /* descsz */ ; \ + .long 4484f - 3f /* descsz */ ; \ .long type ; \ 1:.asciz #name ; \ -2:.align 4 ; \ -3:desctype descdata ; \ -4:.align 4 ; \ +2:.balign 4 ; \ +3: + +#define ELFNOTE_END \ +4484:.balign 4 ; \ .popsection ; + +#define ELFNOTE(name, type, desc) \ + ELFNOTE_START(name, type, "") \ + desc ; \ + ELFNOTE_END + #else /* !__ASSEMBLER__ */ #include /* -- cgit v1.2.3 From 5f4352fbffd6c45123dbce9e195efd54df4e177e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:04 -0700 Subject: Allocate and free vmalloc areas Allocate/release a chunk of vmalloc address space: alloc_vm_area reserves a chunk of address space, and makes sure all the pagetables are constructed for that address range - but no pages. free_vm_area releases the address space range. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ian Pratt Signed-off-by: Christian Limpach Signed-off-by: Chris Wright Cc: "Jan Beulich" Cc: "Andi Kleen" --- include/linux/vmalloc.h | 4 ++++ mm/vmalloc.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 132b260aef1e..c2b10cae5da5 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -70,6 +70,10 @@ extern int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages); extern void unmap_kernel_range(unsigned long addr, unsigned long size); +/* Allocate/destroy a 'vmalloc' VM area. */ +extern struct vm_struct *alloc_vm_area(size_t size); +extern void free_vm_area(struct vm_struct *area); + /* * Internals. Dont't use.. 
*/
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8e05a11155c9..3130c343088f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -767,3 +767,56 @@ EXPORT_SYMBOL(remap_vmalloc_range);
 void __attribute__((weak)) vmalloc_sync_all(void)
 {
 }
+
+
+static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+ /* apply_to_page_range() does all the hard work. */
+ return 0;
+}
+
+/**
+ * alloc_vm_area - allocate a range of kernel address space
+ * @size: size of the area
+ * @returns: NULL on failure, vm_struct on success
+ *
+ * This function reserves a range of kernel address space, and
+ * allocates pagetables to map that range. No actual mappings
+ * are created. If the kernel address space is not shared
+ * between processes, it syncs the pagetable across all
+ * processes.
+ */
+struct vm_struct *alloc_vm_area(size_t size)
+{
+ struct vm_struct *area;
+
+ area = get_vm_area(size, VM_IOREMAP);
+ if (area == NULL)
+ return NULL;
+
+ /*
+ * This ensures that page tables are constructed for this region
+ * of kernel virtual address space and mapped into init_mm.
+ */
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+ area->size, f, NULL)) {
+ free_vm_area(area);
+ return NULL;
+ }
+
+ /* Make sure the pagetables are constructed in process kernel
+ mappings */
+ vmalloc_sync_all();
+
+ return area;
+}
+EXPORT_SYMBOL_GPL(alloc_vm_area);
+
+void free_vm_area(struct vm_struct *area)
+{
+ struct vm_struct *ret;
+ ret = remove_vm_area(area->addr);
+ BUG_ON(ret != area);
+ kfree(area);
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
-- cgit v1.2.3

From c85b04c3749507546f6d5868976e4793e35c2ec0 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Tue, 17 Jul 2007 18:37:05 -0700
Subject: xen: add pinned page flag

Add a new definition for PG_owner_priv_1 to define PG_pinned on Xen pagetable pages.

Signed-off-by: Jeremy Fitzhardinge
Signed-off-by: Chris Wright
---
 include/linux/page-flags.h | 5 +++++
 1 file changed, 5 insertions(+)
(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ae2d79f2107e..731cd2ac3227 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -92,6 +92,7 @@
 /* PG_owner_priv_1 users should have descriptive aliases */
 #define PG_checked PG_owner_priv_1 /* Used by some filesystems */
+#define PG_pinned PG_owner_priv_1 /* Xen pinned pagetable */
 #if (BITS_PER_LONG > 32)
 /*
@@ -170,6 +171,10 @@ static inline void SetPageUptodate(struct page *page)
 #define SetPageChecked(page) set_bit(PG_checked, &(page)->flags)
 #define ClearPageChecked(page) clear_bit(PG_checked, &(page)->flags)
+#define PagePinned(page) test_bit(PG_pinned, &(page)->flags)
+#define SetPagePinned(page) set_bit(PG_pinned, &(page)->flags)
+#define ClearPagePinned(page) clear_bit(PG_pinned, &(page)->flags)
+
 #define PageReserved(page) test_bit(PG_reserved, &(page)->flags)
 #define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags)
 #define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags)
-- cgit v1.2.3

From 9f27ee595038653ddf8bca871200d39247d6f4fc Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Tue, 17 Jul 2007 18:37:06 -0700
Subject: xen: add virtual block device driver.

The block device frontend driver allows the kernel to access block
devices exported by a virtual machine containing a physical block
device driver.
Signed-off-by: Ian Pratt Signed-off-by: Christian Limpach Signed-off-by: Chris Wright Cc: Arjan van de Ven Cc: Greg KH Cc: Jens Axboe --- drivers/block/Kconfig | 9 + drivers/block/Makefile | 1 + drivers/block/xen-blkfront.c | 988 +++++++++++++++++++++++++++++++++++++++++++ include/linux/major.h | 2 + 4 files changed, 1000 insertions(+) create mode 100644 drivers/block/xen-blkfront.c (limited to 'include/linux') diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 8f65b88cf711..a4a311992408 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -427,4 +427,13 @@ config XILINX_SYSACE help Include support for the Xilinx SystemACE CompactFlash interface +config XEN_BLKDEV_FRONTEND + tristate "Xen virtual block device support" + depends on XEN + default y + help + This driver implements the front-end of the Xen virtual + block device driver. It communicates with a back-end driver + in another domain which drives the actual block device. + endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 9ee08ab4ffa8..3e31532df0ed 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_VIODASD) += viodasd.o obj-$(CONFIG_BLK_DEV_SX8) += sx8.o obj-$(CONFIG_BLK_DEV_UB) += ub.o +obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c new file mode 100644 index 000000000000..6746c29181f8 --- /dev/null +++ b/drivers/block/xen-blkfront.c @@ -0,0 +1,988 @@ +/* + * blkfront.c + * + * XenLinux virtual block device driver. + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + * Copyright (c) 2004, Christian Limpach + * Copyright (c) 2004, Andrew Warfield + * Copyright (c) 2005, Christopher Clark + * Copyright (c) 2005, XenSource Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include + +enum blkif_state { + BLKIF_STATE_DISCONNECTED, + BLKIF_STATE_CONNECTED, + BLKIF_STATE_SUSPENDED, +}; + +struct blk_shadow { + struct blkif_request req; + unsigned long request; + unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; + +static struct block_device_operations xlvbd_block_fops; + +#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE) + +/* + * We have one of these per vbd, whether ide, scsi or 'other'. They + * hang in private_data off the gendisk structure. We may end up + * putting all kinds of interesting stuff here :-) + */ +struct blkfront_info +{ + struct xenbus_device *xbdev; + dev_t dev; + struct gendisk *gd; + int vdevice; + blkif_vdev_t handle; + enum blkif_state connected; + int ring_ref; + struct blkif_front_ring ring; + unsigned int evtchn, irq; + struct request_queue *rq; + struct work_struct work; + struct gnttab_free_callback callback; + struct blk_shadow shadow[BLK_RING_SIZE]; + unsigned long shadow_free; + int feature_barrier; + + /** + * The number of people holding this device open. We won't allow a + * hot-unplug unless this is 0. + */ + int users; +}; + +static DEFINE_SPINLOCK(blkif_io_lock); + +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \ + (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) +#define GRANT_INVALID_REF 0 + +#define PARTS_PER_DISK 16 + +#define BLKIF_MAJOR(dev) ((dev)>>8) +#define BLKIF_MINOR(dev) ((dev) & 0xff) + +#define DEV_NAME "xvd" /* name in /dev */ + +/* Information about our VBDs. */ +#define MAX_VBDS 64 +static LIST_HEAD(vbds_list); + +static int get_id_from_freelist(struct blkfront_info *info) +{ + unsigned long free = info->shadow_free; + BUG_ON(free > BLK_RING_SIZE); + info->shadow_free = info->shadow[free].req.id; + info->shadow[free].req.id = 0x0fffffee; /* debug */ + return free; +} + +static void add_id_to_freelist(struct blkfront_info *info, + unsigned long id) +{ + info->shadow[id].req.id = info->shadow_free; + info->shadow[id].request = 0; + info->shadow_free = id; +} + +static void blkif_restart_queue_callback(void *arg) +{ + struct blkfront_info *info = (struct blkfront_info *)arg; + schedule_work(&info->work); +} + +/* + * blkif_queue_request + * + * request block io + * + * id: for guest use only. + * operation: BLKIF_OP_{READ,WRITE,PROBE} + * buffer: buffer to read/write into. this should be a + * virtual address in the guest os. + */ +static int blkif_queue_request(struct request *req) +{ + struct blkfront_info *info = req->rq_disk->private_data; + unsigned long buffer_mfn; + struct blkif_request *ring_req; + struct bio *bio; + struct bio_vec *bvec; + int idx; + unsigned long id; + unsigned int fsect, lsect; + int ref; + grant_ref_t gref_head; + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + return 1; + + if (gnttab_alloc_grant_references( + BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) { + gnttab_request_free_callback( + &info->callback, + blkif_restart_queue_callback, + info, + BLKIF_MAX_SEGMENTS_PER_REQUEST); + return 1; + } + + /* Fill out a communications ring structure. */ + ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); + id = get_id_from_freelist(info); + info->shadow[id].request = (unsigned long)req; + + ring_req->id = id; + ring_req->sector_number = (blkif_sector_t)req->sector; + ring_req->handle = info->handle; + + ring_req->operation = rq_data_dir(req) ? 
+ BLKIF_OP_WRITE : BLKIF_OP_READ; + if (blk_barrier_rq(req)) + ring_req->operation = BLKIF_OP_WRITE_BARRIER; + + ring_req->nr_segments = 0; + rq_for_each_bio (bio, req) { + bio_for_each_segment (bvec, bio, idx) { + BUG_ON(ring_req->nr_segments + == BLKIF_MAX_SEGMENTS_PER_REQUEST); + buffer_mfn = pfn_to_mfn(page_to_pfn(bvec->bv_page)); + fsect = bvec->bv_offset >> 9; + lsect = fsect + (bvec->bv_len >> 9) - 1; + /* install a grant reference. */ + ref = gnttab_claim_grant_reference(&gref_head); + BUG_ON(ref == -ENOSPC); + + gnttab_grant_foreign_access_ref( + ref, + info->xbdev->otherend_id, + buffer_mfn, + rq_data_dir(req) ); + + info->shadow[id].frame[ring_req->nr_segments] = + mfn_to_pfn(buffer_mfn); + + ring_req->seg[ring_req->nr_segments] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + + ring_req->nr_segments++; + } + } + + info->ring.req_prod_pvt++; + + /* Keep a private copy so we can reissue requests when recovering. */ + info->shadow[id].req = *ring_req; + + gnttab_free_grant_references(gref_head); + + return 0; +} + + +static inline void flush_requests(struct blkfront_info *info) +{ + int notify; + + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); + + if (notify) + notify_remote_via_irq(info->irq); +} + +/* + * do_blkif_request + * read a block; request is in a request queue + */ +static void do_blkif_request(request_queue_t *rq) +{ + struct blkfront_info *info = NULL; + struct request *req; + int queued; + + pr_debug("Entered do_blkif_request\n"); + + queued = 0; + + while ((req = elv_next_request(rq)) != NULL) { + info = req->rq_disk->private_data; + if (!blk_fs_request(req)) { + end_request(req, 0); + continue; + } + + if (RING_FULL(&info->ring)) + goto wait; + + pr_debug("do_blk_req %p: cmd %p, sec %lx, " + "(%u/%li) buffer:%p [%s]\n", + req, req->cmd, (unsigned long)req->sector, + req->current_nr_sectors, + req->nr_sectors, req->buffer, + rq_data_dir(req) ? "write" : "read"); + + + blkdev_dequeue_request(req); + if (blkif_queue_request(req)) { + blk_requeue_request(rq, req); +wait: + /* Avoid pointless unplugs. */ + blk_stop_queue(rq); + break; + } + + queued++; + } + + if (queued != 0) + flush_requests(info); +} + +static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) +{ + request_queue_t *rq; + + rq = blk_init_queue(do_blkif_request, &blkif_io_lock); + if (rq == NULL) + return -1; + + elevator_init(rq, "noop"); + + /* Hard sector size and max sectors impersonate the equiv. hardware. */ + blk_queue_hardsect_size(rq, sector_size); + blk_queue_max_sectors(rq, 512); + + /* Each segment in a request is up to an aligned page in size. */ + blk_queue_segment_boundary(rq, PAGE_SIZE - 1); + blk_queue_max_segment_size(rq, PAGE_SIZE); + + /* Ensure a merged request will fit in a single I/O ring slot. */ + blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + + /* Make sure buffer addresses are sector-aligned. */ + blk_queue_dma_alignment(rq, 511); + + gd->queue = rq; + + return 0; +} + + +static int xlvbd_barrier(struct blkfront_info *info) +{ + int err; + + err = blk_queue_ordered(info->rq, + info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE, + NULL); + + if (err) + return err; + + printk(KERN_INFO "blkfront: %s: barriers %s\n", + info->gd->disk_name, + info->feature_barrier ? 
"enabled" : "disabled"); + return 0; +} + + +static int xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, + int vdevice, u16 vdisk_info, u16 sector_size, + struct blkfront_info *info) +{ + struct gendisk *gd; + int nr_minors = 1; + int err = -ENODEV; + + BUG_ON(info->gd != NULL); + BUG_ON(info->rq != NULL); + + if ((minor % PARTS_PER_DISK) == 0) + nr_minors = PARTS_PER_DISK; + + gd = alloc_disk(nr_minors); + if (gd == NULL) + goto out; + + if (nr_minors > 1) + sprintf(gd->disk_name, "%s%c", DEV_NAME, + 'a' + minor / PARTS_PER_DISK); + else + sprintf(gd->disk_name, "%s%c%d", DEV_NAME, + 'a' + minor / PARTS_PER_DISK, + minor % PARTS_PER_DISK); + + gd->major = XENVBD_MAJOR; + gd->first_minor = minor; + gd->fops = &xlvbd_block_fops; + gd->private_data = info; + gd->driverfs_dev = &(info->xbdev->dev); + set_capacity(gd, capacity); + + if (xlvbd_init_blk_queue(gd, sector_size)) { + del_gendisk(gd); + goto out; + } + + info->rq = gd->queue; + info->gd = gd; + + if (info->feature_barrier) + xlvbd_barrier(info); + + if (vdisk_info & VDISK_READONLY) + set_disk_ro(gd, 1); + + if (vdisk_info & VDISK_REMOVABLE) + gd->flags |= GENHD_FL_REMOVABLE; + + if (vdisk_info & VDISK_CDROM) + gd->flags |= GENHD_FL_CD; + + return 0; + + out: + return err; +} + +static void kick_pending_request_queues(struct blkfront_info *info) +{ + if (!RING_FULL(&info->ring)) { + /* Re-enable calldowns. */ + blk_start_queue(info->rq); + /* Kick things off immediately. */ + do_blkif_request(info->rq); + } +} + +static void blkif_restart_queue(struct work_struct *work) +{ + struct blkfront_info *info = container_of(work, struct blkfront_info, work); + + spin_lock_irq(&blkif_io_lock); + if (info->connected == BLKIF_STATE_CONNECTED) + kick_pending_request_queues(info); + spin_unlock_irq(&blkif_io_lock); +} + +static void blkif_free(struct blkfront_info *info, int suspend) +{ + /* Prevent new requests being issued until we fix things up. */ + spin_lock_irq(&blkif_io_lock); + info->connected = suspend ? + BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; + /* No more blkif_request(). */ + if (info->rq) + blk_stop_queue(info->rq); + /* No more gnttab callback work. */ + gnttab_cancel_free_callback(&info->callback); + spin_unlock_irq(&blkif_io_lock); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_scheduled_work(); + + /* Free resources associated with old device channel. */ + if (info->ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(info->ring_ref, 0, + (unsigned long)info->ring.sring); + info->ring_ref = GRANT_INVALID_REF; + info->ring.sring = NULL; + } + if (info->irq) + unbind_from_irqhandler(info->irq, info); + info->evtchn = info->irq = 0; + +} + +static void blkif_completion(struct blk_shadow *s) +{ + int i; + for (i = 0; i < s->req.nr_segments; i++) + gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL); +} + +static irqreturn_t blkif_interrupt(int irq, void *dev_id) +{ + struct request *req; + struct blkif_response *bret; + RING_IDX i, rp; + unsigned long flags; + struct blkfront_info *info = (struct blkfront_info *)dev_id; + int uptodate; + + spin_lock_irqsave(&blkif_io_lock, flags); + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { + spin_unlock_irqrestore(&blkif_io_lock, flags); + return IRQ_HANDLED; + } + + again: + rp = info->ring.sring->rsp_prod; + rmb(); /* Ensure we see queued responses up to 'rp'. 
*/ + + for (i = info->ring.rsp_cons; i != rp; i++) { + unsigned long id; + int ret; + + bret = RING_GET_RESPONSE(&info->ring, i); + id = bret->id; + req = (struct request *)info->shadow[id].request; + + blkif_completion(&info->shadow[id]); + + add_id_to_freelist(info, id); + + uptodate = (bret->status == BLKIF_RSP_OKAY); + switch (bret->operation) { + case BLKIF_OP_WRITE_BARRIER: + if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { + printk(KERN_WARNING "blkfront: %s: write barrier op failed\n", + info->gd->disk_name); + uptodate = -EOPNOTSUPP; + info->feature_barrier = 0; + xlvbd_barrier(info); + } + /* fall through */ + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + if (unlikely(bret->status != BLKIF_RSP_OKAY)) + dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " + "request: %x\n", bret->status); + + ret = end_that_request_first(req, uptodate, + req->hard_nr_sectors); + BUG_ON(ret); + end_that_request_last(req, uptodate); + break; + default: + BUG(); + } + } + + info->ring.rsp_cons = i; + + if (i != info->ring.req_prod_pvt) { + int more_to_do; + RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + if (more_to_do) + goto again; + } else + info->ring.sring->rsp_event = i + 1; + + kick_pending_request_queues(info); + + spin_unlock_irqrestore(&blkif_io_lock, flags); + + return IRQ_HANDLED; +} + + +static int setup_blkring(struct xenbus_device *dev, + struct blkfront_info *info) +{ + struct blkif_sring *sring; + int err; + + info->ring_ref = GRANT_INVALID_REF; + + sring = (struct blkif_sring *)__get_free_page(GFP_KERNEL); + if (!sring) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); + return -ENOMEM; + } + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); + + err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); + if (err < 0) { + free_page((unsigned long)sring); + info->ring.sring = NULL; + goto fail; + } + info->ring_ref = err; + + err = xenbus_alloc_evtchn(dev, &info->evtchn); + if (err) + goto fail; + + err = bind_evtchn_to_irqhandler(info->evtchn, + blkif_interrupt, + IRQF_SAMPLE_RANDOM, "blkif", info); + if (err <= 0) { + xenbus_dev_fatal(dev, err, + "bind_evtchn_to_irqhandler failed"); + goto fail; + } + info->irq = err; + + return 0; +fail: + blkif_free(info, 0); + return err; +} + + +/* Common code used when first setting up, and when resuming. */ +static int talk_to_backend(struct xenbus_device *dev, + struct blkfront_info *info) +{ + const char *message = NULL; + struct xenbus_transaction xbt; + int err; + + /* Create shared ring, alloc event channel. 
*/
+ err = setup_blkring(dev, info);
+ if (err)
+ goto out;
+
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "starting transaction");
+ goto destroy_blkring;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename,
+ "ring-ref", "%u", info->ring_ref);
+ if (err) {
+ message = "writing ring-ref";
+ goto abort_transaction;
+ }
+ err = xenbus_printf(xbt, dev->nodename,
+ "event-channel", "%u", info->evtchn);
+ if (err) {
+ message = "writing event-channel";
+ goto abort_transaction;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err) {
+ if (err == -EAGAIN)
+ goto again;
+ xenbus_dev_fatal(dev, err, "completing transaction");
+ goto destroy_blkring;
+ }
+
+ xenbus_switch_state(dev, XenbusStateInitialised);
+
+ return 0;
+
+ abort_transaction:
+ xenbus_transaction_end(xbt, 1);
+ if (message)
+ xenbus_dev_fatal(dev, err, "%s", message);
+ destroy_blkring:
+ blkif_free(info, 0);
+ out:
+ return err;
+}
+
+
+/**
+ * Entry point to this code when a new device is created. Allocate the basic
+ * structures and the ring buffer for communication with the backend, and
+ * inform the backend of the appropriate details for those. Switch to
+ * Initialised state.
+ */
+static int blkfront_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err, vdevice, i;
+ struct blkfront_info *info;
+
+ /* FIXME: Use dynamic device id if this is not set. */
+ err = xenbus_scanf(XBT_NIL, dev->nodename,
+ "virtual-device", "%i", &vdevice);
+ if (err != 1) {
+ xenbus_dev_fatal(dev, err, "reading virtual-device");
+ return err;
+ }
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
+ return -ENOMEM;
+ }
+
+ info->xbdev = dev;
+ info->vdevice = vdevice;
+ info->connected = BLKIF_STATE_DISCONNECTED;
+ INIT_WORK(&info->work, blkif_restart_queue);
+
+ for (i = 0; i < BLK_RING_SIZE; i++)
+ info->shadow[i].req.id = i+1;
+ info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+
+ /* Front end dir is a number, which is used as the id. */
+ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
+ dev->dev.driver_data = info;
+
+ err = talk_to_backend(dev, info);
+ if (err) {
+ kfree(info);
+ dev->dev.driver_data = NULL;
+ return err;
+ }
+
+ return 0;
+}
+
+
+static int blkif_recover(struct blkfront_info *info)
+{
+ int i;
+ struct blkif_request *req;
+ struct blk_shadow *copy;
+ int j;
+
+ /* Stage 1: Make a safe copy of the shadow state. */
+ copy = kmalloc(sizeof(info->shadow), GFP_KERNEL);
+ if (!copy)
+ return -ENOMEM;
+ memcpy(copy, info->shadow, sizeof(info->shadow));
+
+ /* Stage 2: Set up free list. */
+ memset(&info->shadow, 0, sizeof(info->shadow));
+ for (i = 0; i < BLK_RING_SIZE; i++)
+ info->shadow[i].req.id = i+1;
+ info->shadow_free = info->ring.req_prod_pvt;
+ info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+
+ /* Stage 3: Find pending requests and requeue them. */
+ for (i = 0; i < BLK_RING_SIZE; i++) {
+ /* Not in use? */
+ if (copy[i].request == 0)
+ continue;
+
+ /* Grab a request slot and copy shadow state into it. */
+ req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
+ *req = copy[i].req;
+
+ /* We get a new request id, and must reset the shadow state. */
+ req->id = get_id_from_freelist(info);
+ memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
+
+ /* Rewrite any grant references invalidated by susp/resume. */
*/ + for (j = 0; j < req->nr_segments; j++) + gnttab_grant_foreign_access_ref( + req->seg[j].gref, + info->xbdev->otherend_id, + pfn_to_mfn(info->shadow[req->id].frame[j]), + rq_data_dir( + (struct request *) + info->shadow[req->id].request)); + info->shadow[req->id].req = *req; + + info->ring.req_prod_pvt++; + } + + kfree(copy); + + xenbus_switch_state(info->xbdev, XenbusStateConnected); + + spin_lock_irq(&blkif_io_lock); + + /* Now safe for us to use the shared ring */ + info->connected = BLKIF_STATE_CONNECTED; + + /* Send off requeued requests */ + flush_requests(info); + + /* Kick any other new requests queued since we resumed */ + kick_pending_request_queues(info); + + spin_unlock_irq(&blkif_io_lock); + + return 0; +} + +/** + * We are reconnecting to the backend, due to a suspend/resume, or a backend + * driver restart. We tear down our blkif structure and recreate it, but + * leave the device-layer structures intact so that this is transparent to the + * rest of the kernel. + */ +static int blkfront_resume(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->dev.driver_data; + int err; + + dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename); + + blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); + + err = talk_to_backend(dev, info); + if (info->connected == BLKIF_STATE_SUSPENDED && !err) + err = blkif_recover(info); + + return err; +} + + +/* + * Invoked when the backend is finally 'ready' (and has told produced + * the details about the physical device - #sectors, size, etc). + */ +static void blkfront_connect(struct blkfront_info *info) +{ + unsigned long long sectors; + unsigned long sector_size; + unsigned int binfo; + int err; + + if ((info->connected == BLKIF_STATE_CONNECTED) || + (info->connected == BLKIF_STATE_SUSPENDED) ) + return; + + dev_dbg(&info->xbdev->dev, "%s:%s.\n", + __func__, info->xbdev->otherend); + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "sectors", "%llu", §ors, + "info", "%u", &binfo, + "sector-size", "%lu", §or_size, + NULL); + if (err) { + xenbus_dev_fatal(info->xbdev, err, + "reading backend fields at %s", + info->xbdev->otherend); + return; + } + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-barrier", "%lu", &info->feature_barrier, + NULL); + if (err) + info->feature_barrier = 0; + + err = xlvbd_alloc_gendisk(BLKIF_MINOR(info->vdevice), + sectors, info->vdevice, + binfo, sector_size, info); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", + info->xbdev->otherend); + return; + } + + xenbus_switch_state(info->xbdev, XenbusStateConnected); + + /* Kick pending requests. */ + spin_lock_irq(&blkif_io_lock); + info->connected = BLKIF_STATE_CONNECTED; + kick_pending_request_queues(info); + spin_unlock_irq(&blkif_io_lock); + + add_disk(info->gd); +} + +/** + * Handle the change of state of the backend to Closing. We must delete our + * device-layer structures now, to ensure that writes are flushed through to + * the backend. Once is this done, we can switch to Closed in + * acknowledgement. + */ +static void blkfront_closing(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->dev.driver_data; + unsigned long flags; + + dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename); + + if (info->rq == NULL) + goto out; + + spin_lock_irqsave(&blkif_io_lock, flags); + + del_gendisk(info->gd); + + /* No more blkif_request(). */ + blk_stop_queue(info->rq); + + /* No more gnttab callback work. 
*/ + gnttab_cancel_free_callback(&info->callback); + spin_unlock_irqrestore(&blkif_io_lock, flags); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_scheduled_work(); + + blk_cleanup_queue(info->rq); + info->rq = NULL; + + out: + xenbus_frontend_closed(dev); +} + +/** + * Callback received when the backend's state changes. + */ +static void backend_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + struct blkfront_info *info = dev->dev.driver_data; + struct block_device *bd; + + dev_dbg(&dev->dev, "blkfront:backend_changed.\n"); + + switch (backend_state) { + case XenbusStateInitialising: + case XenbusStateInitWait: + case XenbusStateInitialised: + case XenbusStateUnknown: + case XenbusStateClosed: + break; + + case XenbusStateConnected: + blkfront_connect(info); + break; + + case XenbusStateClosing: + bd = bdget(info->dev); + if (bd == NULL) + xenbus_dev_fatal(dev, -ENODEV, "bdget failed"); + + mutex_lock(&bd->bd_mutex); + if (info->users > 0) + xenbus_dev_error(dev, -EBUSY, + "Device in use; refusing to close"); + else + blkfront_closing(dev); + mutex_unlock(&bd->bd_mutex); + bdput(bd); + break; + } +} + +static int blkfront_remove(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->dev.driver_data; + + dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename); + + blkif_free(info, 0); + + kfree(info); + + return 0; +} + +static int blkif_open(struct inode *inode, struct file *filep) +{ + struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; + info->users++; + return 0; +} + +static int blkif_release(struct inode *inode, struct file *filep) +{ + struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; + info->users--; + if (info->users == 0) { + /* Check whether we have been instructed to close. We will + have ignored this request initially, as the device was + still mounted. */ + struct xenbus_device *dev = info->xbdev; + enum xenbus_state state = xenbus_read_driver_state(dev->otherend); + + if (state == XenbusStateClosing) + blkfront_closing(dev); + } + return 0; +} + +static struct block_device_operations xlvbd_block_fops = +{ + .owner = THIS_MODULE, + .open = blkif_open, + .release = blkif_release, +}; + + +static struct xenbus_device_id blkfront_ids[] = { + { "vbd" }, + { "" } +}; + +static struct xenbus_driver blkfront = { + .name = "vbd", + .owner = THIS_MODULE, + .ids = blkfront_ids, + .probe = blkfront_probe, + .remove = blkfront_remove, + .resume = blkfront_resume, + .otherend_changed = backend_changed, +}; + +static int __init xlblk_init(void) +{ + if (!is_running_on_xen()) + return -ENODEV; + + if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) { + printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n", + XENVBD_MAJOR, DEV_NAME); + return -ENODEV; + } + + return xenbus_register_frontend(&blkfront); +} +module_init(xlblk_init); + + +static void xlblk_exit(void) +{ + return xenbus_unregister_driver(&blkfront); +} +module_exit(xlblk_exit); + +MODULE_DESCRIPTION("Xen virtual block device frontend"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR); diff --git a/include/linux/major.h b/include/linux/major.h index 7e7c9093919a..0cb98053537a 100644 --- a/include/linux/major.h +++ b/include/linux/major.h @@ -158,6 +158,8 @@ #define VXSPEC_MAJOR 200 /* VERITAS volume config driver */ #define VXDMP_MAJOR 201 /* VERITAS volume multipath driver */ +#define XENVBD_MAJOR 202 /* Xen virtual block device */ + #define MSR_MAJOR 202 #define CPUID_MAJOR 203 -- cgit v1.2.3
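
A detail of the Xen block frontend patch above that is easy to miss is how completed requests are matched back to their bookkeeping: blkfront_probe() and blkif_recover() chain the unused shadow entries together through their req.id fields, with 0x0fffffff marking the end of the chain, and the interrupt handler hands a finished slot back with add_id_to_freelist(). Because the backend echoes the request id in each response (bret->id), the handler can find the shadow entry for a completed request by direct index, without scanning the ring. The standalone C program below is a minimal sketch of that free-list bookkeeping, not the driver's code: the names freelist_init, get_id, add_id and FREELIST_END, and the fixed BLK_RING_SIZE of 32, are illustrative choices made here, and the real get_id_from_freelist()/add_id_to_freelist() helpers are defined earlier in the new frontend source than the excerpt shown.

#include <stdio.h>

#define BLK_RING_SIZE	32		/* illustrative; the driver derives it from the ring macros */
#define FREELIST_END	0x0fffffff	/* end-of-chain marker, as set up in blkfront_probe() */

struct shadow_slot {
	unsigned long id;		/* stands in for shadow[i].req.id */
	void *request;			/* the in-flight request, if any */
};

static struct shadow_slot shadow[BLK_RING_SIZE];
static unsigned long shadow_free;

static void freelist_init(void)
{
	int i;

	for (i = 0; i < BLK_RING_SIZE; i++)
		shadow[i].id = i + 1;	/* each free slot points at the next */
	shadow[BLK_RING_SIZE - 1].id = FREELIST_END;
	shadow_free = 0;
}

/* Pop the head of the free list; a real caller must guard against exhaustion. */
static unsigned long get_id(void)
{
	unsigned long id = shadow_free;

	shadow_free = shadow[id].id;	/* new head is whatever this slot pointed at */
	return id;
}

/* Push a completed slot back onto the head of the free list. */
static void add_id(unsigned long id)
{
	shadow[id].id = shadow_free;
	shadow[id].request = NULL;
	shadow_free = id;
}

int main(void)
{
	unsigned long a, b;

	freelist_init();
	a = get_id();
	b = get_id();
	printf("allocated slots %lu and %lu\n", a, b);

	add_id(a);
	printf("freed slot %lu; next allocation returns %lu\n", a, get_id());
	return 0;
}

Built with a plain cc, the sketch allocates slots 0 and 1, frees slot 0, and shows that the next allocation reuses it: freed ids go back onto the head of the list, so both allocation and completion stay O(1), which mirrors the way the interrupt handler above looks a finished request up directly via bret->id rather than searching the shadow array.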