summaryrefslogtreecommitdiff
path: root/fs/dax.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/dax.c')
-rw-r--r--fs/dax.c332
1 files changed, 228 insertions, 104 deletions
diff --git a/fs/dax.c b/fs/dax.c
index f001d8c72a06..95981591977a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
static void *dax_insert_mapping_entry(struct address_space *mapping,
struct vm_fault *vmf,
void *entry, sector_t sector,
- unsigned long flags)
+ unsigned long flags, bool dirty)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
void *new_entry;
pgoff_t index = vmf->pgoff;
- if (vmf->flags & FAULT_FLAG_WRITE)
+ if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
@@ -565,11 +565,11 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
ret = __radix_tree_lookup(page_tree, index, &node, &slot);
WARN_ON_ONCE(ret != entry);
__radix_tree_replace(page_tree, node, slot,
- new_entry, NULL, NULL);
+ new_entry, NULL);
entry = new_entry;
}
- if (vmf->flags & FAULT_FLAG_WRITE)
+ if (dirty)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
@@ -614,6 +614,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
continue;
+ /*
+ * No need to call mmu_notifier_invalidate_range() as we are
+ * downgrading page table protection not changing it to point
+ * to a new page.
+ *
+ * See Documentation/vm/mmu_notifier.txt
+ */
if (pmdp) {
#ifdef CONFIG_FS_DAX_PMD
pmd_t pmd;
@@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
set_pmd_at(vma->vm_mm, address, pmdp, pmd);
- mmu_notifier_invalidate_range(vma->vm_mm, start, end);
unlock_pmd:
spin_unlock(ptl);
#endif
@@ -643,7 +649,6 @@ unlock_pmd:
pte = pte_wrprotect(pte);
pte = pte_mkclean(pte);
set_pte_at(vma->vm_mm, address, ptep, pte);
- mmu_notifier_invalidate_range(vma->vm_mm, start, end);
unlock_pte:
pte_unmap_unlock(ptep, ptl);
}
@@ -789,7 +794,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
tag_pages_for_writeback(mapping, start_index, end_index);
- pagevec_init(&pvec, 0);
+ pagevec_init(&pvec);
while (!done) {
pvec.nr = find_get_entries_tag(mapping, start_index,
PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
@@ -820,38 +825,42 @@ out:
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
-static int dax_insert_mapping(struct address_space *mapping,
- struct block_device *bdev, struct dax_device *dax_dev,
- sector_t sector, size_t size, void *entry,
- struct vm_area_struct *vma, struct vm_fault *vmf)
+static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
- unsigned long vaddr = vmf->address;
- void *ret, *kaddr;
+ return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
+}
+
+static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
+ pfn_t *pfnp)
+{
+ const sector_t sector = dax_iomap_sector(iomap, pos);
pgoff_t pgoff;
+ void *kaddr;
int id, rc;
- pfn_t pfn;
+ long length;
- rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
if (rc)
return rc;
-
id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
- if (rc < 0) {
- dax_read_unlock(id);
- return rc;
+ length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
+ &kaddr, pfnp);
+ if (length < 0) {
+ rc = length;
+ goto out;
}
+ rc = -EINVAL;
+ if (PFN_PHYS(length) < size)
+ goto out;
+ if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
+ goto out;
+ /* For larger pages we need devmap */
+ if (length > 1 && !pfn_t_devmap(*pfnp))
+ goto out;
+ rc = 0;
+out:
dax_read_unlock(id);
-
- ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
- if (IS_ERR(ret))
- return PTR_ERR(ret);
-
- trace_dax_insert_mapping(mapping->host, vmf, ret);
- if (vmf->flags & FAULT_FLAG_WRITE)
- return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
- else
- return vm_insert_mixed(vma, vaddr, pfn);
+ return rc;
}
/*
@@ -877,7 +886,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
}
entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
- RADIX_DAX_ZERO_PAGE);
+ RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(entry2)) {
ret = VM_FAULT_SIGBUS;
goto out;
@@ -936,11 +945,6 @@ int __dax_zero_page_range(struct block_device *bdev,
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);
-static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
-{
- return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
-}
-
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap)
@@ -1080,19 +1084,33 @@ static int dax_fault_return(int error)
return VM_FAULT_SIGBUS;
}
-static int dax_iomap_pte_fault(struct vm_fault *vmf,
+/*
+ * MAP_SYNC on a dax mapping guarantees dirty metadata is
+ * flushed on write-faults (non-cow), but not read-faults.
+ */
+static bool dax_fault_is_synchronous(unsigned long flags,
+ struct vm_area_struct *vma, struct iomap *iomap)
+{
+ return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
+ && (iomap->flags & IOMAP_F_DIRTY);
+}
+
+static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
- struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ struct vm_area_struct *vma = vmf->vma;
+ struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
- sector_t sector;
struct iomap iomap = { 0 };
unsigned flags = IOMAP_FAULT;
int error, major = 0;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+ bool sync;
int vmf_ret = 0;
void *entry;
+ pfn_t pfn;
trace_dax_pte_fault(inode, vmf, vmf_ret);
/*
@@ -1105,7 +1123,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
goto out;
}
- if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+ if (write && !vmf->cow_page)
flags |= IOMAP_WRITE;
entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
@@ -1140,9 +1158,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
goto error_finish_iomap;
}
- sector = dax_iomap_sector(&iomap, pos);
-
if (vmf->cow_page) {
+ sector_t sector = dax_iomap_sector(&iomap, pos);
+
switch (iomap.type) {
case IOMAP_HOLE:
case IOMAP_UNWRITTEN:
@@ -1168,22 +1186,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
goto finish_iomap;
}
+ sync = dax_fault_is_synchronous(flags, vma, &iomap);
+
switch (iomap.type) {
case IOMAP_MAPPED:
if (iomap.flags & IOMAP_F_NEW) {
count_vm_event(PGMAJFAULT);
- count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
+ count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
major = VM_FAULT_MAJOR;
}
- error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
- sector, PAGE_SIZE, entry, vmf->vma, vmf);
+ error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
+ if (error < 0)
+ goto error_finish_iomap;
+
+ entry = dax_insert_mapping_entry(mapping, vmf, entry,
+ dax_iomap_sector(&iomap, pos),
+ 0, write && !sync);
+ if (IS_ERR(entry)) {
+ error = PTR_ERR(entry);
+ goto error_finish_iomap;
+ }
+
+ /*
+ * If we are doing synchronous page fault and inode needs fsync,
+ * we can insert PTE into page tables only after that happens.
+ * Skip insertion for now and return the pfn so that caller can
+ * insert it after fsync is done.
+ */
+ if (sync) {
+ if (WARN_ON_ONCE(!pfnp)) {
+ error = -EIO;
+ goto error_finish_iomap;
+ }
+ *pfnp = pfn;
+ vmf_ret = VM_FAULT_NEEDDSYNC | major;
+ goto finish_iomap;
+ }
+ trace_dax_insert_mapping(inode, vmf, entry);
+ if (write)
+ error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+ else
+ error = vm_insert_mixed(vma, vaddr, pfn);
+
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error == -EBUSY)
error = 0;
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
- if (!(vmf->flags & FAULT_FLAG_WRITE)) {
+ if (!write) {
vmf_ret = dax_load_hole(mapping, entry, vmf);
goto finish_iomap;
}
@@ -1218,53 +1269,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
}
#ifdef CONFIG_FS_DAX_PMD
-static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
- loff_t pos, void *entry)
-{
- struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- const sector_t sector = dax_iomap_sector(iomap, pos);
- struct dax_device *dax_dev = iomap->dax_dev;
- struct block_device *bdev = iomap->bdev;
- struct inode *inode = mapping->host;
- const size_t size = PMD_SIZE;
- void *ret = NULL, *kaddr;
- long length = 0;
- pgoff_t pgoff;
- pfn_t pfn = {};
- int id;
-
- if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
- goto fallback;
-
- id = dax_read_lock();
- length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
- if (length < 0)
- goto unlock_fallback;
- length = PFN_PHYS(length);
-
- if (length < size)
- goto unlock_fallback;
- if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
- goto unlock_fallback;
- if (!pfn_t_devmap(pfn))
- goto unlock_fallback;
- dax_read_unlock(id);
-
- ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
- RADIX_DAX_PMD);
- if (IS_ERR(ret))
- goto fallback;
-
- trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
- return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
- pfn, vmf->flags & FAULT_FLAG_WRITE);
-
-unlock_fallback:
- dax_read_unlock(id);
-fallback:
- trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
- return VM_FAULT_FALLBACK;
-}
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
+ * more often than one might expect in the below functions.
+ */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
void *entry)
@@ -1283,7 +1292,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
goto fallback;
ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
- RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
+ RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(ret))
goto fallback;
@@ -1305,13 +1314,14 @@ fallback:
return VM_FAULT_FALLBACK;
}
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
bool write = vmf->flags & FAULT_FLAG_WRITE;
+ bool sync;
unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
struct inode *inode = mapping->host;
int result = VM_FAULT_FALLBACK;
@@ -1320,6 +1330,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
void *entry;
loff_t pos;
int error;
+ pfn_t pfn;
/*
* Check whether offset isn't beyond end of file now. Caller is
@@ -1327,7 +1338,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
* this is a reliable test.
*/
pgoff = linear_page_index(vma, pmd_addr);
- max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+ max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
@@ -1351,13 +1362,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
if ((pmd_addr + PMD_SIZE) > vma->vm_end)
goto fallback;
- if (pgoff > max_pgoff) {
+ if (pgoff >= max_pgoff) {
result = VM_FAULT_SIGBUS;
goto out;
}
/* If the PMD would extend beyond the file size */
- if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+ if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
goto fallback;
/*
@@ -1395,9 +1406,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
if (iomap.offset + iomap.length < pos + PMD_SIZE)
goto finish_iomap;
+ sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
+
switch (iomap.type) {
case IOMAP_MAPPED:
- result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
+ error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
+ if (error < 0)
+ goto finish_iomap;
+
+ entry = dax_insert_mapping_entry(mapping, vmf, entry,
+ dax_iomap_sector(&iomap, pos),
+ RADIX_DAX_PMD, write && !sync);
+ if (IS_ERR(entry))
+ goto finish_iomap;
+
+ /*
+ * If we are doing synchronous page fault and inode needs fsync,
+ * we can insert PMD into page tables only after that happens.
+ * Skip insertion for now and return the pfn so that caller can
+ * insert it after fsync is done.
+ */
+ if (sync) {
+ if (WARN_ON_ONCE(!pfnp))
+ goto finish_iomap;
+ *pfnp = pfn;
+ result = VM_FAULT_NEEDDSYNC;
+ goto finish_iomap;
+ }
+
+ trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
+ result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+ write);
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
@@ -1437,7 +1476,7 @@ out:
return result;
}
#else
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
return VM_FAULT_FALLBACK;
@@ -1447,7 +1486,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
/**
* dax_iomap_fault - handle a page fault on a DAX file
* @vmf: The description of the fault
- * @ops: iomap ops passed from the file system
+ * @pe_size: Size of the page to fault in
+ * @pfnp: PFN to insert for synchronous faults if fsync is required
+ * @ops: Iomap ops passed from the file system
*
* When a page fault occurs, filesystems may call this helper in
* their fault handler for DAX files. dax_iomap_fault() assumes the caller
@@ -1455,15 +1496,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
* successfully.
*/
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
- const struct iomap_ops *ops)
+ pfn_t *pfnp, const struct iomap_ops *ops)
{
switch (pe_size) {
case PE_SIZE_PTE:
- return dax_iomap_pte_fault(vmf, ops);
+ return dax_iomap_pte_fault(vmf, pfnp, ops);
case PE_SIZE_PMD:
- return dax_iomap_pmd_fault(vmf, ops);
+ return dax_iomap_pmd_fault(vmf, pfnp, ops);
default:
return VM_FAULT_FALLBACK;
}
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+/**
+ * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function inserts writeable PTE or PMD entry into page tables for mmaped
+ * DAX file. It takes care of marking corresponding radix tree entry as dirty
+ * as well.
+ */
+static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ pfn_t pfn)
+{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ void *entry, **slot;
+ pgoff_t index = vmf->pgoff;
+ int vmf_ret, error;
+
+ spin_lock_irq(&mapping->tree_lock);
+ entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ /* Did we race with someone splitting entry or so? */
+ if (!entry ||
+ (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
+ (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
+ put_unlocked_mapping_entry(mapping, index, entry);
+ spin_unlock_irq(&mapping->tree_lock);
+ trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
+ VM_FAULT_NOPAGE);
+ return VM_FAULT_NOPAGE;
+ }
+ radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+ entry = lock_slot(mapping, slot);
+ spin_unlock_irq(&mapping->tree_lock);
+ switch (pe_size) {
+ case PE_SIZE_PTE:
+ error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+ vmf_ret = dax_fault_return(error);
+ break;
+#ifdef CONFIG_FS_DAX_PMD
+ case PE_SIZE_PMD:
+ vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+ pfn, true);
+ break;
+#endif
+ default:
+ vmf_ret = VM_FAULT_FALLBACK;
+ }
+ put_locked_mapping_entry(mapping, index);
+ trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
+ return vmf_ret;
+}
+
+/**
+ * dax_finish_sync_fault - finish synchronous page fault
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function ensures that the file range touched by the page fault is
+ * stored persistently on the media and handles inserting of appropriate page
+ * table entry.
+ */
+int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+ pfn_t pfn)
+{
+ int err;
+ loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
+ size_t len = 0;
+
+ if (pe_size == PE_SIZE_PTE)
+ len = PAGE_SIZE;
+ else if (pe_size == PE_SIZE_PMD)
+ len = PMD_SIZE;
+ else
+ WARN_ON_ONCE(1);
+ err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
+ if (err)
+ return VM_FAULT_SIGBUS;
+ return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
+}
+EXPORT_SYMBOL_GPL(dax_finish_sync_fault);