From d02f00cc057809d96c044cc72d5b9809d59f7d49 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 7 Dec 2009 13:10:48 -0800 Subject: ocfs2: allocation reservations This patch improves Ocfs2 allocation policy by allowing an inode to reserve a portion of the local alloc bitmap for itself. The reserved portion (allocation window) is advisory in that other allocation windows might steal it if the local alloc bitmap becomes full. Otherwise, the reservations are honored and guaranteed to be free. When the local alloc window is moved to a different portion of the bitmap, existing reservations are discarded. Reservation windows are represented internally by a red-black tree. Within that tree, each node represents the reservation window of one inode. An LRU of active reservations is also maintained. When new data is written, we allocate it from the inodes window. When all bits in a window are exhausted, we allocate a new one as close to the previous one as possible. Should we not find free space, an existing reservation is pulled off the LRU and cannibalized. Signed-off-by: Mark Fasheh --- Documentation/filesystems/ocfs2.txt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'Documentation') diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index c58b9f5ba002..412df9095937 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt @@ -80,3 +80,6 @@ user_xattr (*) Enables Extended User Attributes. nouser_xattr Disables Extended User Attributes. acl Enables POSIX Access Control Lists support. noacl (*) Disables POSIX Access Control Lists support. +resv_level=4 (*) Set how agressive allocation reservations will be. + Valid values are between 0 (reservations off) to 8 + (maximum space for reservations). -- cgit v1.2.3 From b07f8f24dfe54da0f074b78949044842e8df881f Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 5 Apr 2010 18:17:15 -0700 Subject: ocfs2: change default reservation window sizes The default reservation size of 4 (32-bit windows) is a bit too ambitious. Scale it back to 16 bits (resv_level=2). I have been testing various sizes on a 4-node cluster which runs a mixed workload that is heavily threaded. With a 256MB local alloc, I get *roughly* the following levels of average file fragmentation: resv_level=0 70% resv_level=1 21% resv_level=2 23% resv_level=3 24% resv_level=4 60% resv_level=5 did not test resv_level=6 60% resv_level=2 seemed like a good compromise between not letting windows be too small, but not so big that heavier workloads will immediately suffer without tuning. This patch also change the behavior of directory reservations - they now track file reservations. The previous compromise of giving directory windows only 8 bits wound up fragmenting more at some window sizes because file allocations had smaller unused windows to poach from. Signed-off-by: Mark Fasheh Signed-off-by: Joel Becker --- Documentation/filesystems/ocfs2.txt | 2 +- fs/ocfs2/reservations.c | 7 ++++--- fs/ocfs2/reservations.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index 412df9095937..32339e584a9a 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt @@ -80,6 +80,6 @@ user_xattr (*) Enables Extended User Attributes. nouser_xattr Disables Extended User Attributes. acl Enables POSIX Access Control Lists support. noacl (*) Disables POSIX Access Control Lists support. -resv_level=4 (*) Set how agressive allocation reservations will be. +resv_level=2 (*) Set how agressive allocation reservations will be. Valid values are between 0 (reservations off) to 8 (maximum space for reservations). diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 7fc6cfee95f1..87fa35791f18 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -55,9 +55,10 @@ static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap, if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) { /* 8, 16, 32, 64, 128, 256, 512, 1024 */ bits = 4 << osb->osb_resv_level; - } else - bits = OCFS2_RESV_DIR_WINDOW_BITS; - + } else { + /* For now, treat directories the same as files. */ + bits = 4 << osb->osb_resv_level; + } return bits; } diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index 34bb308375c5..022aff601e15 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -22,7 +22,7 @@ #include -#define OCFS2_DEFAULT_RESV_LEVEL 4 +#define OCFS2_DEFAULT_RESV_LEVEL 2 #define OCFS2_MAX_RESV_LEVEL 9 #define OCFS2_MIN_RESV_LEVEL 0 -- cgit v1.2.3 From 83f92318fa33cc084e14e64dc903e605f75884c1 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 5 Apr 2010 18:17:16 -0700 Subject: ocfs2: Add dir_resv_level mount option The default behavior for directory reservations stays the same, but we add a mount option so people can tweak the size of directory reservations according to their workloads. Signed-off-by: Mark Fasheh Signed-off-by: Joel Becker --- Documentation/filesystems/ocfs2.txt | 4 ++++ fs/ocfs2/dir.c | 6 ++++-- fs/ocfs2/ocfs2.h | 1 + fs/ocfs2/reservations.c | 9 ++++++--- fs/ocfs2/reservations.h | 2 ++ fs/ocfs2/super.c | 23 +++++++++++++++++++++++ 6 files changed, 40 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index 32339e584a9a..1f7ae144f6d8 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt @@ -83,3 +83,7 @@ noacl (*) Disables POSIX Access Control Lists support. resv_level=2 (*) Set how agressive allocation reservations will be. Valid values are between 0 (reservations off) to 8 (maximum space for reservations). +dir_resv_level= (*) By default, directory reservations will scale with file + reservations - users should rarely need to change this + value. If allocation reservations are turned off, this + option will have no effect. diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8563f97c58af..6c9a28a2d3ae 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2977,7 +2977,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * if we only get one now, that's enough to continue. The rest * will be claimed after the conversion to extents. */ - data_ac->ac_resv = &oi->ip_la_data_resv; + if (ocfs2_dir_resv_allowed(osb)) + data_ac->ac_resv = &oi->ip_la_data_resv; ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); if (ret) { mlog_errno(ret); @@ -3348,7 +3349,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, goto bail; } - data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; + if (ocfs2_dir_resv_allowed(osb)) + data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; credits = ocfs2_calc_extend_credits(sb, el, 1); } else { diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 09d7aee3dabe..a388528f485c 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -356,6 +356,7 @@ struct ocfs2_super struct ocfs2_reservation_map osb_la_resmap; unsigned int osb_resv_level; + unsigned int osb_dir_resv_level; /* Next three fields are for local node slot recovery during * mount. */ diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 87fa35791f18..6497bcc00fa5 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -44,7 +44,11 @@ DEFINE_SPINLOCK(resv_lock); #define OCFS2_MIN_RESV_WINDOW_BITS 8 #define OCFS2_MAX_RESV_WINDOW_BITS 1024 -#define OCFS2_RESV_DIR_WINDOW_BITS OCFS2_MIN_RESV_WINDOW_BITS + +int ocfs2_dir_resv_allowed(struct ocfs2_super *osb) +{ + return (osb->osb_resv_level && osb->osb_dir_resv_level); +} static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv) @@ -56,8 +60,7 @@ static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap, /* 8, 16, 32, 64, 128, 256, 512, 1024 */ bits = 4 << osb->osb_resv_level; } else { - /* For now, treat directories the same as files. */ - bits = 4 << osb->osb_resv_level; + bits = 4 << osb->osb_dir_resv_level; } return bits; } diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index 022aff601e15..25b0c0e31e91 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -67,6 +67,8 @@ void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv); void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, unsigned int flags); +int ocfs2_dir_resv_allowed(struct ocfs2_super *osb); + /** * ocfs2_resv_discard() - truncate a reservation * @resmap: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 5745682eb1c0..79d7d4cf45b1 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -96,6 +96,7 @@ struct mount_options signed short slot; int localalloc_opt; unsigned int resv_level; + int dir_resv_level; char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; }; @@ -178,6 +179,7 @@ enum { Opt_usrquota, Opt_grpquota, Opt_resv_level, + Opt_dir_resv_level, Opt_err, }; @@ -205,6 +207,7 @@ static const match_table_t tokens = { {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, {Opt_resv_level, "resv_level=%u"}, + {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_err, NULL} }; @@ -1034,6 +1037,11 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); osb->osb_resv_level = parsed_options.resv_level; + osb->osb_dir_resv_level = parsed_options.resv_level; + if (parsed_options.dir_resv_level == -1) + osb->osb_dir_resv_level = parsed_options.resv_level; + else + osb->osb_dir_resv_level = parsed_options.dir_resv_level; status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -1295,6 +1303,7 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->localalloc_opt = -1; mopt->cluster_stack[0] = '\0'; mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; + mopt->dir_resv_level = -1; if (!options) { status = 1; @@ -1449,6 +1458,17 @@ static int ocfs2_parse_options(struct super_block *sb, option < OCFS2_MAX_RESV_LEVEL) mopt->resv_level = option; break; + case Opt_dir_resv_level: + if (is_remount) + break; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= OCFS2_MIN_RESV_LEVEL && + option < OCFS2_MAX_RESV_LEVEL) + mopt->dir_resv_level = option; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1533,6 +1553,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL) seq_printf(s, ",resv_level=%d", osb->osb_resv_level); + if (osb->osb_dir_resv_level != osb->osb_resv_level) + seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); + return 0; } -- cgit v1.2.3