/*
 *  fs/ext4/extents_status.c
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *	Allison Henderson <achender@linux.vnet.ibm.com>
 *	Hugh Dickins <hughd@google.com>
 *	Zheng Liu <wenqing.lz@taobao.com>
 *
 * Ext4 extents status tree core functions.
 */
#include <linux/rbtree.h>
#include "ext4.h"
#include "extents_status.h"
#include "ext4_extents.h"

#include <trace/events/ext4.h>

/*
 * According to previous discussion in Ext4 Developer Workshop, we
 * will introduce a new structure called io tree to track all extent
 * status in order to solve some problems that we have met
 * (e.g. Reservation space warning), and provide extent-level locking.
 * Delay extent tree is the first step to achieve this goal.  It is
 * original built by Yongqiang Yang.  At that time it is called delay
 * extent tree, whose goal is only track delay extent in memory to
 * simplify the implementation of fiemap and bigalloc, and introduce
 * lseek SEEK_DATA/SEEK_HOLE support.  That is why it is still called
 * delay extent tree at the following comment.  But for better
 * understand what it does, it has been rename to extent status tree.
 *
 * Currently the first step has been done.  All delay extents are
 * tracked in the tree.  It maintains the delay extent when a delay
 * allocation is issued, and the delay extent is written out or
 * invalidated.  Therefore the implementation of fiemap and bigalloc
 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
 *
 * The following comment describes the implemenmtation of extent
 * status tree and future works.
 */

/*
 * extents status tree implementation for ext4.
 *
 *
 * ==========================================================================
 * Extents status encompass delayed extents and extent locks
 *
 * 1. Why delayed extent implementation ?
 *
 * Without delayed extent, ext4 identifies a delayed extent by looking
 * up page cache, this has several deficiencies - complicated, buggy,
 * and inefficient code.
 *
 * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
 * to know if a block or a range of blocks are belonged to a delayed
 * extent.
 *
 * Let us have a look at how they do without delayed extents implementation.
 *   --	FIEMAP
 *	FIEMAP looks up page cache to identify delayed allocations from holes.
 *
 *   --	SEEK_HOLE/DATA
 *	SEEK_HOLE/DATA has the same problem as FIEMAP.
 *
 *   --	bigalloc
 *	bigalloc looks up page cache to figure out if a block is
 *	already under delayed allocation or not to determine whether
 *	quota reserving is needed for the cluster.
 *
 *   -- punch hole
 *	punch hole looks up page cache to identify a delayed extent.
 *
 *   --	writeout
 *	Writeout looks up whole page cache to see if a buffer is
 *	mapped, If there are not very many delayed buffers, then it is
 *	time comsuming.
 *
 * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA,
 * bigalloc and writeout can figure out if a block or a range of
 * blocks is under delayed allocation(belonged to a delayed extent) or
 * not by searching the delayed extent tree.
 *
 *
 * ==========================================================================
 * 2. ext4 delayed extents impelmentation
 *
 *   --	delayed extent
 *	A delayed extent is a range of blocks which are contiguous
 *	logically and under delayed allocation.  Unlike extent in
 *	ext4, delayed extent in ext4 is a in-memory struct, there is
 *	no corresponding on-disk data.  There is no limit on length of
 *	delayed extent, so a delayed extent can contain as many blocks
 *	as they are contiguous logically.
 *
 *   --	delayed extent tree
 *	Every inode has a delayed extent tree and all under delayed
 *	allocation blocks are added to the tree as delayed extents.
 *	Delayed extents in the tree are ordered by logical block no.
 *
 *   --	operations on a delayed extent tree
 *	There are three operations on a delayed extent tree: find next
 *	delayed extent, adding a space(a range of blocks) and removing
 *	a space.
 *
 *   --	race on a delayed extent tree
 *	Delayed extent tree is protected inode->i_es_lock.
 *
 *
 * ==========================================================================
 * 3. performance analysis
 *   --	overhead
 *	1. There is a cache extent for write access, so if writes are
 *	not very random, adding space operaions are in O(1) time.
 *
 *   --	gain
 *	2. Code is much simpler, more readable, more maintainable and
 *	more efficient.
 *
 *
 * ==========================================================================
 * 4. TODO list
 *   -- Track all extent status
 *
 *   -- Improve get block process
 *
 *   -- Extent-level locking
 */

static struct kmem_cache *ext4_es_cachep;

int __init ext4_init_es(void)
{
	ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
	if (ext4_es_cachep == NULL)
		return -ENOMEM;
	return 0;
}

void ext4_exit_es(void)
{
	if (ext4_es_cachep)
		kmem_cache_destroy(ext4_es_cachep);
}

void ext4_es_init_tree(struct ext4_es_tree *tree)
{
	tree->root = RB_ROOT;
	tree->cache_es = NULL;
}

#ifdef ES_DEBUG__
static void ext4_es_print_tree(struct inode *inode)
{
	struct ext4_es_tree *tree;
	struct rb_node *node;

	printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
	tree = &EXT4_I(inode)->i_es_tree;
	node = rb_first(&tree->root);
	while (node) {
		struct extent_status *es;
		es = rb_entry(node, struct extent_status, rb_node);
		printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
		node = rb_next(node);
	}
	printk(KERN_DEBUG "\n");
}
#else
#define ext4_es_print_tree(inode)
#endif

static inline ext4_lblk_t extent_status_end(struct extent_status *es)
{
	BUG_ON(es->start + es->len < es->start);
	return es->start + es->len - 1;
}

/*
 * search through the tree for an delayed extent with a given offset.  If
 * it can't be found, try to find next extent.
 */
static struct extent_status *__es_tree_search(struct rb_root *root,
					      ext4_lblk_t offset)
{
	struct rb_node *node = root->rb_node;
	struct extent_status *es = NULL;

	while (node) {
		es = rb_entry(node, struct extent_status, rb_node);
		if (offset < es->start)
			node = node->rb_left;
		else if (offset > extent_status_end(es))
			node = node->rb_right;
		else
			return es;
	}

	if (es && offset < es->start)
		return es;

	if (es && offset > extent_status_end(es)) {
		node = rb_next(&es->rb_node);
		return node ? rb_entry(node, struct extent_status, rb_node) :
			      NULL;
	}

	return NULL;
}

/*
 * ext4_es_find_extent: find the 1st delayed extent covering @es->start
 * if it exists, otherwise, the next extent after @es->start.
 *
 * @inode: the inode which owns delayed extents
 * @es: delayed extent that we found
 *
 * Returns the first block of the next extent after es, otherwise
 * EXT_MAX_BLOCKS if no delay extent is found.
 * Delayed extent is returned via @es.
 */
ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = NULL;
	struct extent_status *es1 = NULL;
	struct rb_node *node;
	ext4_lblk_t ret = EXT_MAX_BLOCKS;

	trace_ext4_es_find_extent_enter(inode, es->start);

	read_lock(&EXT4_I(inode)->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;

	/* find delay extent in cache firstly */
	if (tree->cache_es) {
		es1 = tree->cache_es;
		if (in_range(es->start, es1->start, es1->len)) {
			es_debug("%u cached by [%u/%u)\n",
				 es->start, es1->start, es1->len);
			goto out;
		}
	}

	es->len = 0;
	es1 = __es_tree_search(&tree->root, es->start);

out:
	if (es1) {
		tree->cache_es = es1;
		es->start = es1->start;
		es->len = es1->len;
		node = rb_next(&es1->rb_node);
		if (node) {
			es1 = rb_entry(node, struct extent_status, rb_node);
			ret = es1->start;
		}
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_find_extent_exit(inode, es, ret);
	return ret;
}

static struct extent_status *
ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
{
	struct extent_status *es;
	es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
	if (es == NULL)
		return NULL;
	es->start = start;
	es->len = len;
	return es;
}

static void ext4_es_free_extent(struct extent_status *es)
{
	kmem_cache_free(ext4_es_cachep, es);
}

static struct extent_status *
ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
{
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_prev(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (es->start == extent_status_end(es1) + 1) {
		es1->len += es->len;
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(es);
		es = es1;
	}

	return es;
}

static struct extent_status *
ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
{
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_next(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (es1->start == extent_status_end(es) + 1) {
		es->len += es1->len;
		rb_erase(node, &tree->root);
		ext4_es_free_extent(es1);
	}

	return es;
}

static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
			      ext4_lblk_t len)
{
	struct rb_node **p = &tree->root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_status *es;
	ext4_lblk_t end = offset + len - 1;

	BUG_ON(end < offset);
	es = tree->cache_es;
	if (es && offset == (extent_status_end(es) + 1)) {
		es_debug("cached by [%u/%u)\n", es->start, es->len);
		es->len += len;
		es = ext4_es_try_to_merge_right(tree, es);
		goto out;
	} else if (es && es->start == end + 1) {
		es_debug("cached by [%u/%u)\n", es->start, es->len);
		es->start = offset;
		es->len += len;
		es = ext4_es_try_to_merge_left(tree, es);
		goto out;
	} else if (es && es->start <= offset &&
		   end <= extent_status_end(es)) {
		es_debug("cached by [%u/%u)\n", es->start, es->len);
		goto out;
	}

	while (*p) {
		parent = *p;
		es = rb_entry(parent, struct extent_status, rb_node);

		if (offset < es->start) {
			if (es->start == end + 1) {
				es->start = offset;
				es->len += len;
				es = ext4_es_try_to_merge_left(tree, es);
				goto out;
			}
			p = &(*p)->rb_left;
		} else if (offset > extent_status_end(es)) {
			if (offset == extent_status_end(es) + 1) {
				es->len += len;
				es = ext4_es_try_to_merge_right(tree, es);
				goto out;
			}
			p = &(*p)->rb_right;
		} else {
			if (extent_status_end(es) <= end)
				es->len = offset - es->start + len;
			goto out;
		}
	}

	es = ext4_es_alloc_extent(offset, len);
	if (!es)
		return -ENOMEM;
	rb_link_node(&es->rb_node, parent, p);
	rb_insert_color(&es->rb_node, &tree->root);

out:
	tree->cache_es = es;
	return 0;
}

/*
 * ext4_es_insert_extent() adds a space to a delayed extent tree.
 * Caller holds inode->i_es_lock.
 *
 * ext4_es_insert_extent is called by ext4_da_write_begin and
 * ext4_es_remove_extent.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
			  ext4_lblk_t len)
{
	struct ext4_es_tree *tree;
	int err = 0;

	trace_ext4_es_insert_extent(inode, offset, len);
	es_debug("add [%u/%u) to extent status tree of inode %lu\n",
		 offset, len, inode->i_ino);

	write_lock(&EXT4_I(inode)->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;
	err = __es_insert_extent(tree, offset, len);
	write_unlock(&EXT4_I(inode)->i_es_lock);

	ext4_es_print_tree(inode);

	return err;
}

/*
 * ext4_es_remove_extent() removes a space from a delayed extent tree.
 * Caller holds inode->i_es_lock.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
			  ext4_lblk_t len)
{
	struct rb_node *node;
	struct ext4_es_tree *tree;
	struct extent_status *es;
	struct extent_status orig_es;
	ext4_lblk_t len1, len2, end;
	int err = 0;

	trace_ext4_es_remove_extent(inode, offset, len);
	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
		 offset, len, inode->i_ino);

	end = offset + len - 1;
	BUG_ON(end < offset);
	write_lock(&EXT4_I(inode)->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;
	es = __es_tree_search(&tree->root, offset);
	if (!es)
		goto out;
	if (es->start > end)
		goto out;

	/* Simply invalidate cache_es. */
	tree->cache_es = NULL;

	orig_es.start = es->start;
	orig_es.len = es->len;
	len1 = offset > es->start ? offset - es->start : 0;
	len2 = extent_status_end(es) > end ?
	       extent_status_end(es) - end : 0;
	if (len1 > 0)
		es->len = len1;
	if (len2 > 0) {
		if (len1 > 0) {
			err = __es_insert_extent(tree, end + 1, len2);
			if (err) {
				es->start = orig_es.start;
				es->len = orig_es.len;
				goto out;
			}
		} else {
			es->start = end + 1;
			es->len = len2;
		}
		goto out;
	}

	if (len1 > 0) {
		node = rb_next(&es->rb_node);
		if (node)
			es = rb_entry(node, struct extent_status, rb_node);
		else
			es = NULL;
	}

	while (es && extent_status_end(es) <= end) {
		node = rb_next(&es->rb_node);
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(es);
		if (!node) {
			es = NULL;
			break;
		}
		es = rb_entry(node, struct extent_status, rb_node);
	}

	if (es && es->start < end + 1) {
		len1 = extent_status_end(es) - end;
		es->start = end + 1;
		es->len = len1;
	}

out:
	write_unlock(&EXT4_I(inode)->i_es_lock);
	ext4_es_print_tree(inode);
	return err;
}