// SPDX-License-Identifier: GPL-2.0-only /* * Historical Service Time * * Keeps a time-weighted exponential moving average of the historical * service time. Estimates future service time based on the historical * service time and the number of outstanding requests. * * Marks paths stale if they have not finished within hst * * num_paths. If a path is stale and unused, we will send a single * request to probe in case the path has improved. This situation * generally arises if the path is so much worse than others that it * will never have the best estimated service time, or if the entire * multipath device is unused. If a path is stale and in use, limit the * number of requests it can receive with the assumption that the path * has become degraded. * * To avoid repeatedly calculating exponents for time weighting, times * are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT) * ns, and the weighting is pre-calculated. * */ #include "dm.h" #include "dm-path-selector.h" #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/module.h> #define DM_MSG_PREFIX "multipath historical-service-time" #define HST_MIN_IO 1 #define HST_VERSION "0.1.1" #define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */ #define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT) #define HST_FIXED_1 (1 << HST_FIXED_SHIFT) #define HST_FIXED_95 972 #define HST_MAX_INFLIGHT HST_FIXED_1 #define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */ #define HST_WEIGHT_COUNT 64ULL struct selector { struct list_head valid_paths; struct list_head failed_paths; int valid_count; spinlock_t lock; unsigned int weights[HST_WEIGHT_COUNT]; unsigned int threshold_multiplier; }; struct path_info { struct list_head list; struct dm_path *path; unsigned int repeat_count; spinlock_t lock; u64 historical_service_time; /* Fixed point */ u64 stale_after; u64 last_finish; u64 outstanding; }; /** * fixed_power - compute: x^n, in O(log n) time * * @x: base of the power * @frac_bits: fractional bits of @x * @n: power to raise @x to. * * By exploiting the relation between the definition of the natural power * function: x^n := x*x*...*x (x multiplied by itself for n times), and * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, * (where: n_i \elem {0, 1}, the binary vector representing n), * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is * of course trivially computable in O(log_2 n), the length of our binary * vector. * * (see: kernel/sched/loadavg.c) */ static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n) { unsigned long result = 1UL << frac_bits; if (n) { for (;;) { if (n & 1) { result *= x; result += 1UL << (frac_bits - 1); result >>= frac_bits; } n >>= 1; if (!n) break; x *= x; x += 1UL << (frac_bits - 1); x >>= frac_bits; } } return result; } /* * Calculate the next value of an exponential moving average * a_1 = a_0 * e + a * (1 - e) * * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT] * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT] * @weight: [0, HST_FIXED_1] * * Note: * To account for multiple periods in the same calculation, * a_n = a_0 * e^n + a * (1 - e^n), * so call fixed_ema(last, next, pow(weight, N)) */ static u64 fixed_ema(u64 last, u64 next, u64 weight) { last *= weight; last += next * (HST_FIXED_1 - weight); last += 1ULL << (HST_FIXED_SHIFT - 1); return last >> HST_FIXED_SHIFT; } static struct selector *alloc_selector(void) { struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); if (s) { INIT_LIST_HEAD(&s->valid_paths); INIT_LIST_HEAD(&s->failed_paths); spin_lock_init(&s->lock); s->valid_count = 0; } return s; } /* * Get the weight for a given time span. */ static u64 hst_weight(struct path_selector *ps, u64 delta) { struct selector *s = ps->context; int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL, HST_WEIGHT_COUNT - 1); return s->weights[bucket]; } /* * Set up the weights array. * * weights[len-1] = 0 * weights[n] = base ^ (n + 1) */ static void hst_set_weights(struct path_selector *ps, unsigned int base) { struct selector *s = ps->context; int i; if (base >= HST_FIXED_1) return; for (i = 0; i < HST_WEIGHT_COUNT - 1; i++) s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1); s->weights[HST_WEIGHT_COUNT - 1] = 0; } static int hst_create(struct path_selector *ps, unsigned int argc, char **argv) { struct selector *s; unsigned int base_weight = HST_FIXED_95; unsigned int threshold_multiplier = 0; char dummy; /* * Arguments: [<base_weight> [<threshold_multiplier>]] * <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A * value of 0 will completely ignore any history. * If not given, default (HST_FIXED_95) is used. * <threshold_multiplier>: Minimum threshold multiplier for paths to * be considered different. That is, a path is * considered different iff (p1 > N * p2) where p1 * is the path with higher service time. A threshold * of 1 or 0 has no effect. Defaults to 0. */ if (argc > 2) return -EINVAL; if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 || base_weight >= HST_FIXED_1)) { return -EINVAL; } if (argc > 1 && (sscanf(argv[1], "%u%c", &threshold_multiplier, &dummy) != 1)) { return -EINVAL; } s = alloc_selector(); if (!s) return -ENOMEM; ps->context = s; hst_set_weights(ps, base_weight); s->threshold_multiplier = threshold_multiplier; return 0; } static void free_paths(struct list_head *paths) { struct path_info *pi, *next; list_for_each_entry_safe(pi, next, paths, list) { list_del(&pi->list); kfree(pi); } } static void hst_destroy(struct path_selector *ps) { struct selector *s = ps->context; free_paths(&s->valid_paths); free_paths(&s->failed_paths); kfree(s); ps->context = NULL; } static int hst_status(struct path_selector *ps, struct dm_path *path, status_type_t type, char *result, unsigned int maxlen) { unsigned int sz = 0; struct path_info *pi; if (!path) { struct selector *s = ps->context; DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier); } else { pi = path->pscontext; switch (type) { case STATUSTYPE_INFO: DMEMIT("%llu %llu %llu ", pi->historical_service_time, pi->outstanding, pi->stale_after); break; case STATUSTYPE_TABLE: DMEMIT("0 "); break; case STATUSTYPE_IMA: *result = '\0'; break; } } return sz; } static int hst_add_path(struct path_selector *ps, struct dm_path *path, int argc, char **argv, char **error) { struct selector *s = ps->context; struct path_info *pi; unsigned int repeat_count = HST_MIN_IO; char dummy; unsigned long flags; /* * Arguments: [<repeat_count>] * <repeat_count>: The number of I/Os before switching path. * If not given, default (HST_MIN_IO) is used. */ if (argc > 1) { *error = "historical-service-time ps: incorrect number of arguments"; return -EINVAL; } if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { *error = "historical-service-time ps: invalid repeat count"; return -EINVAL; } /* allocate the path */ pi = kmalloc(sizeof(*pi), GFP_KERNEL); if (!pi) { *error = "historical-service-time ps: Error allocating path context"; return -ENOMEM; } pi->path = path; pi->repeat_count = repeat_count; pi->historical_service_time = HST_FIXED_1; spin_lock_init(&pi->lock); pi->outstanding = 0; pi->stale_after = 0; pi->last_finish = 0; path->pscontext = pi; spin_lock_irqsave(&s->lock, flags); list_add_tail(&pi->list, &s->valid_paths); s->valid_count++; spin_unlock_irqrestore(&s->lock, flags); return 0; } static void hst_fail_path(struct path_selector *ps, struct dm_path *path) { struct selector *s = ps->context; struct path_info *pi = path->pscontext; unsigned long flags; spin_lock_irqsave(&s->lock, flags); list_move(&pi->list, &s->failed_paths); s->valid_count--; spin_unlock_irqrestore(&s->lock, flags); } static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path) { struct selector *s = ps->context; struct path_info *pi = path->pscontext; unsigned long flags; spin_lock_irqsave(&s->lock, flags); list_move_tail(&pi->list, &s->valid_paths); s->valid_count++; spin_unlock_irqrestore(&s->lock, flags); return 0; } static void hst_fill_compare(struct path_info *pi, u64 *hst, u64 *out, u64 *stale) { unsigned long flags; spin_lock_irqsave(&pi->lock, flags); *hst = pi->historical_service_time; *out = pi->outstanding; *stale = pi->stale_after; spin_unlock_irqrestore(&pi->lock, flags); } /* * Compare the estimated service time of 2 paths, pi1 and pi2, * for the incoming I/O. * * Returns: * < 0 : pi1 is better * 0 : no difference between pi1 and pi2 * > 0 : pi2 is better * */ static long long hst_compare(struct path_info *pi1, struct path_info *pi2, u64 time_now, struct path_selector *ps) { struct selector *s = ps->context; u64 hst1, hst2; long long out1, out2, stale1, stale2; int pi2_better, over_threshold; hst_fill_compare(pi1, &hst1, &out1, &stale1); hst_fill_compare(pi2, &hst2, &out2, &stale2); /* Check here if estimated latency for two paths are too similar. * If this is the case, we skip extra calculation and just compare * outstanding requests. In this case, any unloaded paths will * be preferred. */ if (hst1 > hst2) over_threshold = hst1 > (s->threshold_multiplier * hst2); else over_threshold = hst2 > (s->threshold_multiplier * hst1); if (!over_threshold) return out1 - out2; /* * If an unloaded path is stale, choose it. If both paths are unloaded, * choose path that is the most stale. * (If one path is loaded, choose the other) */ if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) || (!out1 && !out2)) return (!out2 * stale1) - (!out1 * stale2); /* Compare estimated service time. If outstanding is the same, we * don't need to multiply */ if (out1 == out2) { pi2_better = hst1 > hst2; } else { /* Potential overflow with out >= 1024 */ if (unlikely(out1 >= HST_MAX_INFLIGHT || out2 >= HST_MAX_INFLIGHT)) { /* If over 1023 in-flights, we may overflow if hst * is at max. (With this shift we still overflow at * 1048576 in-flights, which is high enough). */ hst1 >>= HST_FIXED_SHIFT; hst2 >>= HST_FIXED_SHIFT; } pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2; } /* In the case that the 'winner' is stale, limit to equal usage. */ if (pi2_better) { if (stale2 < time_now) return out1 - out2; return 1; } if (stale1 < time_now) return out1 - out2; return -1; } static struct dm_path *hst_select_path(struct path_selector *ps, size_t nr_bytes) { struct selector *s = ps->context; struct path_info *pi = NULL, *best = NULL; u64 time_now = ktime_get_ns(); struct dm_path *ret = NULL; unsigned long flags; spin_lock_irqsave(&s->lock, flags); if (list_empty(&s->valid_paths)) goto out; list_for_each_entry(pi, &s->valid_paths, list) { if (!best || (hst_compare(pi, best, time_now, ps) < 0)) best = pi; } if (!best) goto out; /* Move last used path to end (least preferred in case of ties) */ list_move_tail(&best->list, &s->valid_paths); ret = best->path; out: spin_unlock_irqrestore(&s->lock, flags); return ret; } static int hst_start_io(struct path_selector *ps, struct dm_path *path, size_t nr_bytes) { struct path_info *pi = path->pscontext; unsigned long flags; spin_lock_irqsave(&pi->lock, flags); pi->outstanding++; spin_unlock_irqrestore(&pi->lock, flags); return 0; } static u64 path_service_time(struct path_info *pi, u64 start_time) { u64 now = ktime_get_ns(); /* if a previous disk request has finished after this IO was * sent to the hardware, pretend the submission happened * serially. */ if (time_after64(pi->last_finish, start_time)) start_time = pi->last_finish; pi->last_finish = now; if (time_before64(now, start_time)) return 0; return now - start_time; } static int hst_end_io(struct path_selector *ps, struct dm_path *path, size_t nr_bytes, u64 start_time) { struct path_info *pi = path->pscontext; struct selector *s = ps->context; unsigned long flags; u64 st; spin_lock_irqsave(&pi->lock, flags); st = path_service_time(pi, start_time); pi->outstanding--; pi->historical_service_time = fixed_ema(pi->historical_service_time, min(st * HST_FIXED_1, HST_FIXED_MAX), hst_weight(ps, st)); /* * On request end, mark path as fresh. If a path hasn't * finished any requests within the fresh period, the estimated * service time is considered too optimistic and we limit the * maximum requests on that path. */ pi->stale_after = pi->last_finish + (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT)); spin_unlock_irqrestore(&pi->lock, flags); return 0; } static struct path_selector_type hst_ps = { .name = "historical-service-time", .module = THIS_MODULE, .features = DM_PS_USE_HR_TIMER, .table_args = 1, .info_args = 3, .create = hst_create, .destroy = hst_destroy, .status = hst_status, .add_path = hst_add_path, .fail_path = hst_fail_path, .reinstate_path = hst_reinstate_path, .select_path = hst_select_path, .start_io = hst_start_io, .end_io = hst_end_io, }; static int __init dm_hst_init(void) { int r = dm_register_path_selector(&hst_ps); if (r < 0) DMERR("register failed %d", r); DMINFO("version " HST_VERSION " loaded"); return r; } static void __exit dm_hst_exit(void) { int r = dm_unregister_path_selector(&hst_ps); if (r < 0) DMERR("unregister failed %d", r); } module_init(dm_hst_init); module_exit(dm_hst_exit); MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector"); MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>"); MODULE_LICENSE("GPL");