1 files changed, 316 insertions, 32 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 84bf6e79de23..34d10452c56d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -206,9 +206,70 @@ bail:
 	return err;
 }
 
+int ocfs2_read_inline_data(struct inode *inode, struct page *page,
+			   struct buffer_head *di_bh)
+{
+	void *kaddr;
+	unsigned int size;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
+		ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		return -EROFS;
+	}
+
+	size = i_size_read(inode);
+
+	if (size > PAGE_CACHE_SIZE ||
+	    size > ocfs2_max_inline_data(inode->i_sb)) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %llu has with inline data has bad size: %u",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno, size);
+		return -EROFS;
+	}
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	if (size)
+		memcpy(kaddr, di->id2.i_data.id_data, size);
+	/* Clear the remaining part of the page */
+	memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	SetPageUptodate(page);
+
+	return 0;
+}
+
+static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
+
+	ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh,
+			       OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_inline_data(inode, page, di_bh);
+out:
+	unlock_page(page);
+
+	brelse(di_bh);
+	return ret;
+}
+
 static int ocfs2_readpage(struct file *file, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	int ret, unlock = 1;
 
@@ -222,7 +283,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 		goto out;
 	}
 
-	if (down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem) == 0) {
+	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
 		ret = AOP_TRUNCATED_PAGE;
 		goto out_meta_unlock;
 	}
@@ -232,7 +293,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 	 * might now be discovering a truncate that hit on another node.
 	 * block_read_full_page->get_block freaks out if it is asked to read
 	 * beyond the end of a file, so we check here.  Callers
-	 * (generic_file_read, fault->nopage) are clever enough to check i_size
+	 * (generic_file_read, vm_ops->fault) are clever enough to check i_size
 	 * and notice that the page they just read isn't needed.
 	 *
 	 * XXX sys_readahead() seems to get that wrong?
@@ -252,7 +313,10 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 		goto out_alloc;
 	}
 
-	ret = block_read_full_page(page, ocfs2_get_block);
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		ret = ocfs2_readpage_inline(inode, page);
+	else
+		ret = block_read_full_page(page, ocfs2_get_block);
 	unlock = 0;
 
 	ocfs2_data_unlock(inode, 0);
@@ -301,12 +365,8 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
 {
 	int ret;
 
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
 	ret = block_prepare_write(page, from, to, ocfs2_get_block);
 
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
 	return ret;
 }
 
@@ -401,7 +461,9 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 		down_read(&OCFS2_I(inode)->ip_alloc_sem);
 	}
 
-	err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
+	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+		err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+						  NULL);
 
 	if (!INODE_JOURNAL(inode)) {
 		up_read(&OCFS2_I(inode)->ip_alloc_sem);
@@ -415,7 +477,6 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 		goto bail;
 	}
 
-
 bail:
 	status = err ? 0 : p_blkno;
 
@@ -570,6 +631,13 @@ static ssize_t ocfs2_direct_IO(int rw,
 
 	mlog_entry_void();
 
+	/*
+	 * Fallback to buffered I/O if we see an inode without
+	 * extents.
+	 */
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return 0;
+
 	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
 		/*
 		 * We get PR data locks even for O_DIRECT.  This
@@ -834,18 +902,22 @@ struct ocfs2_write_ctxt {
 	struct ocfs2_cached_dealloc_ctxt w_dealloc;
 };
 
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
 {
 	int i;
 
-	for(i = 0; i < wc->w_num_pages; i++) {
-		if (wc->w_pages[i] == NULL)
-			continue;
-
-		unlock_page(wc->w_pages[i]);
-		mark_page_accessed(wc->w_pages[i]);
-		page_cache_release(wc->w_pages[i]);
+	for(i = 0; i < num_pages; i++) {
+		if (pages[i]) {
+			unlock_page(pages[i]);
+			mark_page_accessed(pages[i]);
+			page_cache_release(pages[i]);
+		}
 	}
+}
+
+static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+{
+	ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
 
 	brelse(wc->w_di_bh);
 	kfree(wc);
@@ -855,6 +927,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 				  struct ocfs2_super *osb, loff_t pos,
 				  unsigned len, struct buffer_head *di_bh)
 {
+	u32 cend;
 	struct ocfs2_write_ctxt *wc;
 
 	wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
@@ -862,7 +935,8 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 		return -ENOMEM;
 
 	wc->w_cpos = pos >> osb->s_clustersize_bits;
-	wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
+	cend = (pos + len - 1) >> osb->s_clustersize_bits;
+	wc->w_clen = cend - wc->w_cpos + 1;
 	get_bh(di_bh);
 	wc->w_di_bh = di_bh;
 
@@ -928,18 +1002,11 @@ static void ocfs2_write_failure(struct inode *inode,
 				loff_t user_pos, unsigned user_len)
 {
 	int i;
-	unsigned from, to;
+	unsigned from = user_pos & (PAGE_CACHE_SIZE - 1),
+		to = user_pos + user_len;
 	struct page *tmppage;
 
-	ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
-
-	if (wc->w_large_pages) {
-		from = wc->w_target_from;
-		to = wc->w_target_to;
-	} else {
-		from = 0;
-		to = PAGE_CACHE_SIZE;
-	}
+	ocfs2_zero_new_buffers(wc->w_target_page, from, to);
 
 	for(i = 0; i < wc->w_num_pages; i++) {
 		tmppage = wc->w_pages[i];
@@ -989,9 +1056,6 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
 			map_from = cluster_start;
 			map_to = cluster_end;
 		}
-
-		wc->w_target_from = map_from;
-		wc->w_target_to = map_to;
 	} else {
 		/*
 		 * If we haven't allocated the new page yet, we
@@ -1209,18 +1273,33 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
 				       loff_t pos, unsigned len)
 {
 	int ret, i;
+	loff_t cluster_off;
+	unsigned int local_len = len;
 	struct ocfs2_write_cluster_desc *desc;
+	struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb);
 
 	for (i = 0; i < wc->w_clen; i++) {
 		desc = &wc->w_desc[i];
 
+		/*
+		 * We have to make sure that the total write passed in
+		 * doesn't extend past a single cluster.
+		 */
+		local_len = len;
+		cluster_off = pos & (osb->s_clustersize - 1);
+		if ((cluster_off + local_len) > osb->s_clustersize)
+			local_len = osb->s_clustersize - cluster_off;
+
 		ret = ocfs2_write_cluster(mapping, desc->c_phys,
 					  desc->c_unwritten, data_ac, meta_ac,
-					  wc, desc->c_cpos, pos, len);
+					  wc, desc->c_cpos, pos, local_len);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
+
+		len -= local_len;
+		pos += local_len;
 	}
 
 	ret = 0;
@@ -1353,6 +1432,160 @@ out:
 	return ret;
 }
 
+static int ocfs2_write_begin_inline(struct address_space *mapping,
+				    struct inode *inode,
+				    struct ocfs2_write_ctxt *wc)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct page *page;
+	handle_t *handle;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+
+	page = find_or_create_page(mapping, 0, GFP_NOFS);
+	if (!page) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	/*
+	 * If we don't set w_num_pages then this page won't get unlocked
+	 * and freed on cleanup of the write context.
+	 */
+	wc->w_pages[0] = wc->w_target_page = page;
+	wc->w_num_pages = 1;
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		ocfs2_commit_trans(osb, handle);
+
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+		ocfs2_set_inode_data_inline(inode, di);
+
+	if (!PageUptodate(page)) {
+		ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh);
+		if (ret) {
+			ocfs2_commit_trans(osb, handle);
+
+			goto out;
+		}
+	}
+
+	wc->w_handle = handle;
+out:
+	return ret;
+}
+
+int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	if (new_size < le16_to_cpu(di->id2.i_data.id_count))
+		return 1;
+	return 0;
+}
+
+static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
+					  struct inode *inode, loff_t pos,
+					  unsigned len, struct page *mmap_page,
+					  struct ocfs2_write_ctxt *wc)
+{
+	int ret, written = 0;
+	loff_t end = pos + len;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
+	     (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
+	     oi->ip_dyn_features);
+
+	/*
+	 * Handle inodes which already have inline data 1st.
+	 */
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		if (mmap_page == NULL &&
+		    ocfs2_size_fits_inline_data(wc->w_di_bh, end))
+			goto do_inline_write;
+
+		/*
+		 * The write won't fit - we have to give this inode an
+		 * inline extent list now.
+		 */
+		ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Check whether the inode can accept inline data.
+	 */
+	if (oi->ip_clusters != 0 || i_size_read(inode) != 0)
+		return 0;
+
+	/*
+	 * Check whether the write can fit.
+	 */
+	if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb))
+		return 0;
+
+do_inline_write:
+	ret = ocfs2_write_begin_inline(mapping, inode, wc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * This signals to the caller that the data can be written
+	 * inline.
+	 */
+	written = 1;
+out:
+	return written ? written : ret;
+}
+
+/*
+ * This function only does anything for file systems which can't
+ * handle sparse files.
+ *
+ * What we want to do here is fill in any hole between the current end
+ * of allocation and the end of our write. That way the rest of the
+ * write path can treat it as an non-allocating write, which has no
+ * special case code for sparse/nonsparse files.
+ */
+static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
+					unsigned len,
+					struct ocfs2_write_ctxt *wc)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	loff_t newsize = pos + len;
+
+	if (ocfs2_sparse_alloc(osb))
+		return 0;
+
+	if (newsize <= i_size_read(inode))
+		return 0;
+
+	ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
+	if (ret)
+		mlog_errno(ret);
+
+	return ret;
+}
+
 int ocfs2_write_begin_nolock(struct address_space *mapping,
 			     loff_t pos, unsigned len, unsigned flags,
 			     struct page **pagep, void **fsdata,
@@ -1374,6 +1607,25 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		return ret;
 	}
 
+	if (ocfs2_supports_inline_data(osb)) {
+		ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
+						     mmap_page, wc);
+		if (ret == 1) {
+			ret = 0;
+			goto success;
+		}
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
 					&extents_to_split);
 	if (ret) {
@@ -1455,6 +1707,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
+success:
 	*pagep = wc->w_target_page;
 	*fsdata = wc;
 	return 0;
@@ -1522,6 +1775,31 @@ out_fail:
 	return ret;
 }
 
+static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
+				   unsigned len, unsigned *copied,
+				   struct ocfs2_dinode *di,
+				   struct ocfs2_write_ctxt *wc)
+{
+	void *kaddr;
+
+	if (unlikely(*copied < len)) {
+		if (!PageUptodate(wc->w_target_page)) {
+			*copied = 0;
+			return;
+		}
+	}
+
+	kaddr = kmap_atomic(wc->w_target_page, KM_USER0);
+	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	mlog(0, "Data written to inode at offset %llu. "
+	     "id_count = %u, copied = %u, i_dyn_features = 0x%x\n",
+	     (unsigned long long)pos, *copied,
+	     le16_to_cpu(di->id2.i_data.id_count),
+	     le16_to_cpu(di->i_dyn_features));
+}
+
 int ocfs2_write_end_nolock(struct address_space *mapping,
 			   loff_t pos, unsigned len, unsigned copied,
 			   struct page *page, void *fsdata)
@@ -1535,6 +1813,11 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 	handle_t *handle = wc->w_handle;
 	struct page *tmppage;
 
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
+		goto out_write_size;
+	}
+
 	if (unlikely(copied < len)) {
 		if (!PageUptodate(wc->w_target_page))
 			copied = 0;
@@ -1572,6 +1855,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 		block_commit_write(tmppage, from, to);
 	}
 
+out_write_size:
 	pos += copied;
 	if (pos > inode->i_size) {
 		i_size_write(inode, pos);