diff options
Diffstat (limited to 'fs')
152 files changed, 4744 insertions, 3582 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 2bc7ad775842..3ef62bad8f2b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -79,6 +79,7 @@ config EXPORTFS_BLOCK_OPS config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT default y + select PERCPU_RWSEM help This option enables standard file locking support, required for filesystems like NFS and for the flock() system diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index c7efddf6e038..4c09d93d9569 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -89,7 +89,7 @@ config BINFMT_SCRIPT config BINFMT_FLAT bool "Kernel support for flat binaries" - depends on !MMU || M68K + depends on !MMU || ARM || M68K depends on !FRV || BROKEN help Support uClinux FLAT format binaries. diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 7ef637d7f3a5..1e9d2f84e5b5 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -461,8 +461,8 @@ static void afs_callback_updater(struct work_struct *work) */ int __init afs_callback_update_init(void) { - afs_callback_update_worker = - create_singlethread_workqueue("kafs_callbackd"); + afs_callback_update_worker = alloc_ordered_workqueue("kafs_callbackd", + WQ_MEM_RECLAIM); return afs_callback_update_worker ? 0 : -ENOMEM; } diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 85737e96ab8b..2037e7a77a37 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -17,19 +17,12 @@ #include "internal.h" #include "afs_cm.h" -#if 0 -struct workqueue_struct *afs_cm_workqueue; -#endif /* 0 */ - -static int afs_deliver_cb_init_call_back_state(struct afs_call *, - struct sk_buff *, bool); -static int afs_deliver_cb_init_call_back_state3(struct afs_call *, - struct sk_buff *, bool); -static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_probe_uuid(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *, - struct sk_buff *, bool); +static int afs_deliver_cb_init_call_back_state(struct afs_call *); +static int afs_deliver_cb_init_call_back_state3(struct afs_call *); +static int afs_deliver_cb_probe(struct afs_call *); +static int afs_deliver_cb_callback(struct afs_call *); +static int afs_deliver_cb_probe_uuid(struct afs_call *); +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *); static void afs_cm_destructor(struct afs_call *); /* @@ -134,7 +127,7 @@ static void afs_cm_destructor(struct afs_call *call) * received. The step number here must match the final number in * afs_deliver_cb_callback(). */ - if (call->unmarshall == 6) { + if (call->unmarshall == 5) { ASSERT(call->server && call->count && call->request); afs_break_callbacks(call->server, call->count, call->request); } @@ -168,27 +161,27 @@ static void SRXAFSCB_CallBack(struct work_struct *work) /* * deliver request data to a CB.CallBack call */ -static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_callback(struct afs_call *call) { + struct sockaddr_rxrpc srx; struct afs_callback *cb; struct afs_server *server; - struct in_addr addr; __be32 *bp; u32 tmp; int ret, loop; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); call->offset = 0; call->unmarshall++; /* extract the FID array and its count in two steps */ case 1: _debug("extract FID count"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -205,8 +198,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, case 2: _debug("extract FID array"); - ret = afs_extract_data(call, skb, last, call->buffer, - call->count * 3 * 4); + ret = afs_extract_data(call, call->buffer, + call->count * 3 * 4, true); if (ret < 0) return ret; @@ -232,7 +225,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, /* extract the callback array and its count in two steps */ case 3: _debug("extract CB count"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -242,13 +235,11 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, return -EBADMSG; call->offset = 0; call->unmarshall++; - if (tmp == 0) - goto empty_cb_array; case 4: _debug("extract CB array"); - ret = afs_extract_data(call, skb, last, call->request, - call->count * 3 * 4); + ret = afs_extract_data(call, call->buffer, + call->count * 3 * 4, false); if (ret < 0) return ret; @@ -261,15 +252,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, cb->type = ntohl(*bp++); } - empty_cb_array: call->offset = 0; call->unmarshall++; - case 5: - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; - /* Record that the message was unmarshalled successfully so * that the call destructor can know do the callback breaking * work, even if the final ACK isn't received. @@ -278,17 +263,15 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, * updated also. */ call->unmarshall++; - case 6: + case 5: break; } - call->state = AFS_CALL_REPLYING; /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; @@ -315,17 +298,17 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) /* * deliver request data to a CB.InitCallBackState call */ -static int afs_deliver_cb_init_call_back_state(struct afs_call *call, - struct sk_buff *skb, - bool last) +static int afs_deliver_cb_init_call_back_state(struct afs_call *call) { + struct sockaddr_rxrpc srx; struct afs_server *server; - struct in_addr addr; int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); + + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); - ret = afs_data_complete(call, skb, last); + ret = afs_extract_data(call, NULL, 0, false); if (ret < 0) return ret; @@ -334,8 +317,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call, /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; @@ -348,27 +330,68 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call, /* * deliver request data to a CB.InitCallBackState3 call */ -static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, - struct sk_buff *skb, - bool last) +static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) { + struct sockaddr_rxrpc srx; struct afs_server *server; - struct in_addr addr; + struct afs_uuid *r; + unsigned loop; + __be32 *b; + int ret; + + _enter(""); + + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); + + _enter("{%u}", call->unmarshall); - _enter(",{%u},%d", skb->len, last); + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + call->unmarshall++; - /* There are some arguments that we ignore */ - afs_data_consumed(call, skb); - if (!last) - return -EAGAIN; + case 1: + _debug("extract UUID"); + ret = afs_extract_data(call, call->buffer, + 11 * sizeof(__be32), false); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall UUID"); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + b = call->buffer; + r = call->request; + r->time_low = ntohl(b[0]); + r->time_mid = ntohl(b[1]); + r->time_hi_and_version = ntohl(b[2]); + r->clock_seq_hi_and_reserved = ntohl(b[3]); + r->clock_seq_low = ntohl(b[4]); + + for (loop = 0; loop < 6; loop++) + r->node[loop] = ntohl(b[loop + 5]); + + call->offset = 0; + call->unmarshall++; + + case 2: + break; + } /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; @@ -393,14 +416,13 @@ static void SRXAFSCB_Probe(struct work_struct *work) /* * deliver request data to a CB.Probe call */ -static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_probe(struct afs_call *call) { int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); - ret = afs_data_complete(call, skb, last); + ret = afs_extract_data(call, NULL, 0, false); if (ret < 0) return ret; @@ -426,7 +448,6 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) _enter(""); - if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0) reply.match = htonl(0); else @@ -439,19 +460,14 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) /* * deliver request data to a CB.ProbeUuid call */ -static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_probe_uuid(struct afs_call *call) { struct afs_uuid *r; unsigned loop; __be32 *b; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -463,8 +479,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, case 1: _debug("extract UUID"); - ret = afs_extract_data(call, skb, last, call->buffer, - 11 * sizeof(__be32)); + ret = afs_extract_data(call, call->buffer, + 11 * sizeof(__be32), false); switch (ret) { case 0: break; case -EAGAIN: return 0; @@ -491,16 +507,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, call->unmarshall++; case 2: - _debug("trailer"); - if (skb->len != 0) - return -EBADMSG; break; } - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; - call->state = AFS_CALL_REPLYING; INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); @@ -574,14 +583,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) /* * deliver request data to a CB.TellMeAboutYourself call */ -static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call) { int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); - ret = afs_data_complete(call, skb, last); + ret = afs_extract_data(call, NULL, 0, false); if (ret < 0) return ret; diff --git a/fs/afs/flock.c b/fs/afs/flock.c index d91a9c9cfbd0..3191dff2c156 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -36,8 +36,8 @@ static int afs_init_lock_manager(void) if (!afs_lock_manager) { mutex_lock(&afs_lock_manager_mutex); if (!afs_lock_manager) { - afs_lock_manager = - create_singlethread_workqueue("kafs_lockd"); + afs_lock_manager = alloc_workqueue("kafs_lockd", + WQ_MEM_RECLAIM, 0); if (!afs_lock_manager) ret = -ENOMEM; } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 9312b92e54be..96f4d764d1a6 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -235,16 +235,15 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, /* * deliver reply data to an FS.FetchStatus */ -static int afs_deliver_fs_fetch_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_fetch_status(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -307,8 +306,7 @@ int afs_fs_fetch_file_status(struct afs_server *server, /* * deliver reply data to an FS.FetchData */ -static int afs_deliver_fs_fetch_data(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_fetch_data(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; @@ -316,7 +314,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, void *buffer; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -332,7 +330,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, * client) */ case 1: _debug("extract data length (MSW)"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -347,7 +345,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, /* extract the returned data length */ case 2: _debug("extract data length"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -363,10 +361,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, _debug("extract data"); if (call->count > 0) { page = call->reply3; - buffer = kmap_atomic(page); - ret = afs_extract_data(call, skb, last, buffer, - call->count); - kunmap_atomic(buffer); + buffer = kmap(page); + ret = afs_extract_data(call, buffer, + call->count, true); + kunmap(buffer); if (ret < 0) return ret; } @@ -376,8 +374,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, /* extract the metadata */ case 4: - ret = afs_extract_data(call, skb, last, call->buffer, - (21 + 3 + 6) * 4); + ret = afs_extract_data(call, call->buffer, + (21 + 3 + 6) * 4, false); if (ret < 0) return ret; @@ -391,18 +389,15 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, call->unmarshall++; case 5: - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; break; } if (call->count < PAGE_SIZE) { _debug("clear"); page = call->reply3; - buffer = kmap_atomic(page); + buffer = kmap(page); memset(buffer + call->count, 0, PAGE_SIZE - call->count); - kunmap_atomic(buffer); + kunmap(buffer); } _leave(" = 0 [done]"); @@ -515,13 +510,12 @@ int afs_fs_fetch_data(struct afs_server *server, /* * deliver reply data to an FS.GiveUpCallBacks */ -static int afs_deliver_fs_give_up_callbacks(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_give_up_callbacks(struct afs_call *call) { - _enter(",{%u},%d", skb->len, last); + _enter(""); /* shouldn't be any reply data */ - return afs_data_complete(call, skb, last); + return afs_extract_data(call, NULL, 0, false); } /* @@ -599,16 +593,15 @@ int afs_fs_give_up_callbacks(struct afs_server *server, /* * deliver reply data to an FS.CreateFile or an FS.MakeDir */ -static int afs_deliver_fs_create_vnode(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_create_vnode(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -696,16 +689,15 @@ int afs_fs_create(struct afs_server *server, /* * deliver reply data to an FS.RemoveFile or FS.RemoveDir */ -static int afs_deliver_fs_remove(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_remove(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -777,16 +769,15 @@ int afs_fs_remove(struct afs_server *server, /* * deliver reply data to an FS.Link */ -static int afs_deliver_fs_link(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_link(struct afs_call *call) { struct afs_vnode *dvnode = call->reply, *vnode = call->reply2; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -863,16 +854,15 @@ int afs_fs_link(struct afs_server *server, /* * deliver reply data to an FS.Symlink */ -static int afs_deliver_fs_symlink(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_symlink(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -968,16 +958,15 @@ int afs_fs_symlink(struct afs_server *server, /* * deliver reply data to an FS.Rename */ -static int afs_deliver_fs_rename(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_rename(struct afs_call *call) { struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -1072,16 +1061,15 @@ int afs_fs_rename(struct afs_server *server, /* * deliver reply data to an FS.StoreData */ -static int afs_deliver_fs_store_data(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_store_data(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -1251,17 +1239,16 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, /* * deliver reply data to an FS.StoreStatus */ -static int afs_deliver_fs_store_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_store_status(struct afs_call *call) { afs_dataversion_t *store_version; struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -1443,14 +1430,13 @@ int afs_fs_setattr(struct afs_server *server, struct key *key, /* * deliver reply data to an FS.GetVolumeStatus */ -static int afs_deliver_fs_get_volume_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_get_volume_status(struct afs_call *call) { const __be32 *bp; char *p; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -1460,8 +1446,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the returned status record */ case 1: _debug("extract status"); - ret = afs_extract_data(call, skb, last, call->buffer, - 12 * 4); + ret = afs_extract_data(call, call->buffer, + 12 * 4, true); if (ret < 0) return ret; @@ -1472,7 +1458,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the volume name length */ case 2: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -1487,8 +1473,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 3: _debug("extract volname"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); + ret = afs_extract_data(call, call->reply3, + call->count, true); if (ret < 0) return ret; } @@ -1508,8 +1494,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->count = 4 - (call->count & 3); case 4: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); + ret = afs_extract_data(call, call->buffer, + call->count, true); if (ret < 0) return ret; @@ -1519,7 +1505,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the offline message length */ case 5: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -1534,8 +1520,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 6: _debug("extract offline"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); + ret = afs_extract_data(call, call->reply3, + call->count, true); if (ret < 0) return ret; } @@ -1555,8 +1541,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->count = 4 - (call->count & 3); case 7: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); + ret = afs_extract_data(call, call->buffer, + call->count, true); if (ret < 0) return ret; @@ -1566,7 +1552,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the message of the day length */ case 8: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -1581,8 +1567,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 9: _debug("extract motd"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); + ret = afs_extract_data(call, call->reply3, + call->count, true); if (ret < 0) return ret; } @@ -1595,26 +1581,17 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->unmarshall++; /* extract the message of the day padding */ - if ((call->count & 3) == 0) { - call->unmarshall++; - goto no_motd_padding; - } - call->count = 4 - (call->count & 3); + call->count = (4 - (call->count & 3)) & 3; case 10: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); + ret = afs_extract_data(call, call->buffer, + call->count, false); if (ret < 0) return ret; call->offset = 0; call->unmarshall++; - no_motd_padding: - case 11: - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; break; } @@ -1685,15 +1662,14 @@ int afs_fs_get_volume_status(struct afs_server *server, /* * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock */ -static int afs_deliver_fs_xxxx_lock(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_xxxx_lock(struct afs_call *call) { const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index df976b2a7f40..5497c8496055 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -13,13 +13,13 @@ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/pagemap.h> -#include <linux/skbuff.h> #include <linux/rxrpc.h> #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/fscache.h> #include <linux/backing-dev.h> +#include <net/af_rxrpc.h> #include "afs.h" #include "afs_vl.h" @@ -56,7 +56,7 @@ struct afs_mount_params { */ struct afs_wait_mode { /* RxRPC received message notification */ - void (*rx_wakeup)(struct afs_call *call); + rxrpc_notify_rx_t notify_rx; /* synchronous call waiter and call dispatched notification */ int (*wait)(struct afs_call *call); @@ -75,10 +75,8 @@ struct afs_call { const struct afs_call_type *type; /* type of call */ const struct afs_wait_mode *wait_mode; /* completion wait mode */ wait_queue_head_t waitq; /* processes awaiting completion */ - void (*async_workfn)(struct afs_call *call); /* asynchronous work function */ struct work_struct async_work; /* asynchronous work processor */ struct work_struct work; /* actual work processor */ - struct sk_buff_head rx_queue; /* received packets */ struct rxrpc_call *rxcall; /* RxRPC call handle */ struct key *key; /* security for this call */ struct afs_server *server; /* server affected by incoming CM call */ @@ -92,6 +90,7 @@ struct afs_call { void *reply4; /* reply buffer (fourth part) */ pgoff_t first; /* first page in mapping to deal with */ pgoff_t last; /* last page in mapping to deal with */ + size_t offset; /* offset into received data store */ enum { /* call state */ AFS_CALL_REQUESTING, /* request is being sent for outgoing call */ AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */ @@ -99,21 +98,18 @@ struct afs_call { AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */ AFS_CALL_REPLYING, /* replying to incoming call */ AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */ - AFS_CALL_COMPLETE, /* successfully completed */ - AFS_CALL_BUSY, /* server was busy */ - AFS_CALL_ABORTED, /* call was aborted */ - AFS_CALL_ERROR, /* call failed due to error */ + AFS_CALL_COMPLETE, /* Completed or failed */ } state; int error; /* error code */ + u32 abort_code; /* Remote abort ID or 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ - unsigned reply_size; /* current size of reply */ unsigned first_offset; /* offset into mapping[first] */ unsigned last_to; /* amount of mapping[last] */ - unsigned offset; /* offset into received data store */ unsigned char unmarshall; /* unmarshalling phase */ bool incoming; /* T if incoming call */ bool send_pages; /* T if data from mapping should be sent */ + bool need_attention; /* T if RxRPC poked us */ u16 service_id; /* RxRPC service ID to call */ __be16 port; /* target UDP port */ __be32 operation_ID; /* operation ID for an incoming call */ @@ -128,8 +124,7 @@ struct afs_call_type { /* deliver request or reply data to an call * - returning an error will cause the call to be aborted */ - int (*deliver)(struct afs_call *call, struct sk_buff *skb, - bool last); + int (*deliver)(struct afs_call *call); /* map an abort code to an error number */ int (*abort_to_error)(u32 abort_code); @@ -607,29 +602,22 @@ extern void afs_proc_cell_remove(struct afs_cell *); /* * rxrpc.c */ +extern struct socket *afs_socket; + extern int afs_open_socket(void); extern void afs_close_socket(void); -extern void afs_data_consumed(struct afs_call *, struct sk_buff *); extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, const struct afs_wait_mode *); extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, size_t, size_t); extern void afs_flat_call_destructor(struct afs_call *); -extern int afs_transfer_reply(struct afs_call *, struct sk_buff *, bool); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); -extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *, - size_t); +extern int afs_extract_data(struct afs_call *, void *, size_t, bool); -static inline int afs_data_complete(struct afs_call *call, struct sk_buff *skb, - bool last) +static inline int afs_transfer_reply(struct afs_call *call) { - if (skb->len > 0) - return -EBADMSG; - afs_data_consumed(call, skb); - if (!last) - return -EAGAIN; - return 0; + return afs_extract_data(call, call->buffer, call->reply_max, false); } /* @@ -654,7 +642,7 @@ do { \ extern struct afs_server *afs_lookup_server(struct afs_cell *, const struct in_addr *); -extern struct afs_server *afs_find_server(const struct in_addr *); +extern struct afs_server *afs_find_server(const struct sockaddr_rxrpc *); extern void afs_put_server(struct afs_server *); extern void __exit afs_purge_servers(void); diff --git a/fs/afs/main.c b/fs/afs/main.c index 35de0c04729f..0b187ef3b5b7 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/completion.h> #include <linux/sched.h> +#include <linux/random.h> #include "internal.h" MODULE_DESCRIPTION("AFS Client File System"); diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 14d04c848465..59bdaa7527b6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -16,34 +16,36 @@ #include "internal.h" #include "afs_cm.h" -static struct socket *afs_socket; /* my RxRPC socket */ +struct socket *afs_socket; /* my RxRPC socket */ static struct workqueue_struct *afs_async_calls; +static struct afs_call *afs_spare_incoming_call; static atomic_t afs_outstanding_calls; -static atomic_t afs_outstanding_skbs; -static void afs_wake_up_call_waiter(struct afs_call *); +static void afs_free_call(struct afs_call *); +static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); static int afs_wait_for_call_to_complete(struct afs_call *); -static void afs_wake_up_async_call(struct afs_call *); +static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static int afs_dont_wait_for_call_to_complete(struct afs_call *); -static void afs_process_async_call(struct afs_call *); -static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *); -static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool); +static void afs_process_async_call(struct work_struct *); +static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); +static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); +static int afs_deliver_cm_op_id(struct afs_call *); /* synchronous call management */ const struct afs_wait_mode afs_sync_call = { - .rx_wakeup = afs_wake_up_call_waiter, + .notify_rx = afs_wake_up_call_waiter, .wait = afs_wait_for_call_to_complete, }; /* asynchronous call management */ const struct afs_wait_mode afs_async_call = { - .rx_wakeup = afs_wake_up_async_call, + .notify_rx = afs_wake_up_async_call, .wait = afs_dont_wait_for_call_to_complete, }; /* asynchronous incoming call management */ static const struct afs_wait_mode afs_async_incoming_call = { - .rx_wakeup = afs_wake_up_async_call, + .notify_rx = afs_wake_up_async_call, }; /* asynchronous incoming call initial processing */ @@ -53,17 +55,9 @@ static const struct afs_call_type afs_RXCMxxxx = { .abort_to_error = afs_abort_to_error, }; -static void afs_collect_incoming_call(struct work_struct *); +static void afs_charge_preallocation(struct work_struct *); -static struct sk_buff_head afs_incoming_calls; -static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); - -static void afs_async_workfn(struct work_struct *work) -{ - struct afs_call *call = container_of(work, struct afs_call, async_work); - - call->async_workfn(call); -} +static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation); static int afs_wait_atomic_t(atomic_t *p) { @@ -83,10 +77,8 @@ int afs_open_socket(void) _enter(""); - skb_queue_head_init(&afs_incoming_calls); - ret = -ENOMEM; - afs_async_calls = create_singlethread_workqueue("kafsd"); + afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0); if (!afs_async_calls) goto error_0; @@ -110,13 +102,15 @@ int afs_open_socket(void) if (ret < 0) goto error_2; + rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, + afs_rx_discard_new_call); + ret = kernel_listen(socket, INT_MAX); if (ret < 0) goto error_2; - rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor); - afs_socket = socket; + afs_charge_preallocation(NULL); _leave(" = 0"); return 0; @@ -136,52 +130,28 @@ void afs_close_socket(void) { _enter(""); + if (afs_spare_incoming_call) { + atomic_inc(&afs_outstanding_calls); + afs_free_call(afs_spare_incoming_call); + afs_spare_incoming_call = NULL; + } + + _debug("outstanding %u", atomic_read(&afs_outstanding_calls)); wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t, TASK_UNINTERRUPTIBLE); _debug("no outstanding calls"); + flush_workqueue(afs_async_calls); + kernel_sock_shutdown(afs_socket, SHUT_RDWR); + flush_workqueue(afs_async_calls); sock_release(afs_socket); _debug("dework"); destroy_workqueue(afs_async_calls); - - ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0); _leave(""); } /* - * Note that the data in a socket buffer is now consumed. - */ -void afs_data_consumed(struct afs_call *call, struct sk_buff *skb) -{ - if (!skb) { - _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs)); - dump_stack(); - } else { - _debug("DLVR %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - rxrpc_kernel_data_consumed(call->rxcall, skb); - } -} - -/* - * free a socket buffer - */ -static void afs_free_skb(struct sk_buff *skb) -{ - if (!skb) { - _debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs)); - dump_stack(); - } else { - _debug("FREE %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - if (atomic_dec_return(&afs_outstanding_skbs) == -1) - BUG(); - rxrpc_kernel_free_skb(skb); - } -} - -/* * free a call */ static void afs_free_call(struct afs_call *call) @@ -191,7 +161,6 @@ static void afs_free_call(struct afs_call *call) ASSERTCMP(call->rxcall, ==, NULL); ASSERT(!work_pending(&call->async_work)); - ASSERT(skb_queue_empty(&call->rx_queue)); ASSERT(call->type->name != NULL); kfree(call->request); @@ -207,7 +176,7 @@ static void afs_free_call(struct afs_call *call) static void afs_end_call_nofree(struct afs_call *call) { if (call->rxcall) { - rxrpc_kernel_end_call(call->rxcall); + rxrpc_kernel_end_call(afs_socket, call->rxcall); call->rxcall = NULL; } if (call->type->destructor) @@ -227,7 +196,7 @@ static void afs_end_call(struct afs_call *call) * allocate a call with flat request and reply buffers */ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, - size_t request_size, size_t reply_size) + size_t request_size, size_t reply_max) { struct afs_call *call; @@ -241,7 +210,7 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, call->type = type; call->request_size = request_size; - call->reply_max = reply_size; + call->reply_max = reply_max; if (request_size) { call->request = kmalloc(request_size, GFP_NOFS); @@ -249,14 +218,13 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, goto nomem_free; } - if (reply_size) { - call->buffer = kmalloc(reply_size, GFP_NOFS); + if (reply_max) { + call->buffer = kmalloc(reply_max, GFP_NOFS); if (!call->buffer) goto nomem_free; } init_waitqueue_head(&call->waitq); - skb_queue_head_init(&call->rx_queue); return call; nomem_free: @@ -325,8 +293,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg, * returns from sending the request */ if (first + loop >= last) call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(call->rxcall, msg, - to - offset); + ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, + msg, to - offset); kunmap(pages[loop]); if (ret < 0) break; @@ -354,7 +322,6 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, struct msghdr msg; struct kvec iov[1]; int ret; - struct sk_buff *skb; _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); @@ -366,8 +333,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, atomic_read(&afs_outstanding_calls)); call->wait_mode = wait_mode; - call->async_workfn = afs_process_async_call; - INIT_WORK(&call->async_work, afs_async_workfn); + INIT_WORK(&call->async_work, afs_process_async_call); memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; @@ -380,7 +346,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, /* create a call */ rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, - (unsigned long) call, gfp); + (unsigned long) call, gfp, + wait_mode->notify_rx); call->key = NULL; if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); @@ -406,7 +373,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, * request */ if (!call->send_pages) call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size); + ret = rxrpc_kernel_send_data(afs_socket, rxcall, + &msg, call->request_size); if (ret < 0) goto error_do_abort; @@ -421,9 +389,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, return wait_mode->wait(call); error_do_abort: - rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); + rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, -ret, "KSD"); error_kill_call: afs_end_call(call); _leave(" = %d", ret); @@ -431,140 +397,77 @@ error_kill_call: } /* - * Handles intercepted messages that were arriving in the socket's Rx queue. - * - * Called from the AF_RXRPC call processor in waitqueue process context. For - * each call, it is guaranteed this will be called in order of packet to be - * delivered. - */ -static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID, - struct sk_buff *skb) -{ - struct afs_call *call = (struct afs_call *) user_call_ID; - - _enter("%p,,%u", call, skb->mark); - - _debug("ICPT %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - - ASSERTCMP(sk, ==, afs_socket->sk); - atomic_inc(&afs_outstanding_skbs); - - if (!call) { - /* its an incoming call for our callback service */ - skb_queue_tail(&afs_incoming_calls, skb); - queue_work(afs_wq, &afs_collect_incoming_call_work); - } else { - /* route the messages directly to the appropriate call */ - skb_queue_tail(&call->rx_queue, skb); - call->wait_mode->rx_wakeup(call); - } - - _leave(""); -} - -/* * deliver messages to a call */ static void afs_deliver_to_call(struct afs_call *call) { - struct sk_buff *skb; - bool last; u32 abort_code; int ret; - _enter(""); - - while ((call->state == AFS_CALL_AWAIT_REPLY || - call->state == AFS_CALL_AWAIT_OP_ID || - call->state == AFS_CALL_AWAIT_REQUEST || - call->state == AFS_CALL_AWAIT_ACK) && - (skb = skb_dequeue(&call->rx_queue))) { - switch (skb->mark) { - case RXRPC_SKB_MARK_DATA: - _debug("Rcv DATA"); - last = rxrpc_kernel_is_data_last(skb); - ret = call->type->deliver(call, skb, last); - switch (ret) { - case -EAGAIN: - if (last) { - _debug("short data"); - goto unmarshal_error; - } - break; - case 0: - ASSERT(last); - if (call->state == AFS_CALL_AWAIT_REPLY) - call->state = AFS_CALL_COMPLETE; - break; - case -ENOTCONN: - abort_code = RX_CALL_DEAD; - goto do_abort; - case -ENOTSUPP: - abort_code = RX_INVALID_OPERATION; - goto do_abort; - default: - unmarshal_error: - abort_code = RXGEN_CC_UNMARSHAL; - if (call->state != AFS_CALL_AWAIT_REPLY) - abort_code = RXGEN_SS_UNMARSHAL; - do_abort: - rxrpc_kernel_abort_call(call->rxcall, - abort_code); - call->error = ret; - call->state = AFS_CALL_ERROR; - break; + _enter("%s", call->type->name); + + while (call->state == AFS_CALL_AWAIT_REPLY || + call->state == AFS_CALL_AWAIT_OP_ID || + call->state == AFS_CALL_AWAIT_REQUEST || + call->state == AFS_CALL_AWAIT_ACK + ) { + if (call->state == AFS_CALL_AWAIT_ACK) { + size_t offset = 0; + ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, + NULL, 0, &offset, false, + &call->abort_code); + if (ret == -EINPROGRESS || ret == -EAGAIN) + return; + if (ret == 1) { + call->state = AFS_CALL_COMPLETE; + goto done; } - break; - case RXRPC_SKB_MARK_FINAL_ACK: - _debug("Rcv ACK"); - call->state = AFS_CALL_COMPLETE; - break; - case RXRPC_SKB_MARK_BUSY: - _debug("Rcv BUSY"); - call->error = -EBUSY; - call->state = AFS_CALL_BUSY; - break; - case RXRPC_SKB_MARK_REMOTE_ABORT: - abort_code = rxrpc_kernel_get_abort_code(skb); - call->error = call->type->abort_to_error(abort_code); - call->state = AFS_CALL_ABORTED; - _debug("Rcv ABORT %u -> %d", abort_code, call->error); - break; - case RXRPC_SKB_MARK_LOCAL_ABORT: - abort_code = rxrpc_kernel_get_abort_code(skb); - call->error = call->type->abort_to_error(abort_code); - call->state = AFS_CALL_ABORTED; - _debug("Loc ABORT %u -> %d", abort_code, call->error); - break; - case RXRPC_SKB_MARK_NET_ERROR: - call->error = -rxrpc_kernel_get_error_number(skb); - call->state = AFS_CALL_ERROR; - _debug("Rcv NET ERROR %d", call->error); - break; - case RXRPC_SKB_MARK_LOCAL_ERROR: - call->error = -rxrpc_kernel_get_error_number(skb); - call->state = AFS_CALL_ERROR; - _debug("Rcv LOCAL ERROR %d", call->error); - break; - default: - BUG(); - break; + return; } - afs_free_skb(skb); - } - - /* make sure the queue is empty if the call is done with (we might have - * aborted the call early because of an unmarshalling error) */ - if (call->state >= AFS_CALL_COMPLETE) { - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); - if (call->incoming) - afs_end_call(call); + ret = call->type->deliver(call); + switch (ret) { + case 0: + if (call->state == AFS_CALL_AWAIT_REPLY) + call->state = AFS_CALL_COMPLETE; + goto done; + case -EINPROGRESS: + case -EAGAIN: + goto out; + case -ENOTCONN: + abort_code = RX_CALL_DEAD; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code, -ret, "KNC"); + goto do_abort; + case -ENOTSUPP: + abort_code = RX_INVALID_OPERATION; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code, -ret, "KIV"); + goto do_abort; + case -ENODATA: + case -EBADMSG: + case -EMSGSIZE: + default: + abort_code = RXGEN_CC_UNMARSHAL; + if (call->state != AFS_CALL_AWAIT_REPLY) + abort_code = RXGEN_SS_UNMARSHAL; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code, EBADMSG, "KUM"); + goto do_abort; + } } +done: + if (call->state == AFS_CALL_COMPLETE && call->incoming) + afs_end_call(call); +out: _leave(""); + return; + +do_abort: + call->error = ret; + call->state = AFS_CALL_COMPLETE; + goto done; } /* @@ -572,7 +475,7 @@ static void afs_deliver_to_call(struct afs_call *call) */ static int afs_wait_for_call_to_complete(struct afs_call *call) { - struct sk_buff *skb; + const char *abort_why; int ret; DECLARE_WAITQUEUE(myself, current); @@ -584,15 +487,18 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) set_current_state(TASK_INTERRUPTIBLE); /* deliver any messages that are in the queue */ - if (!skb_queue_empty(&call->rx_queue)) { + if (call->state < AFS_CALL_COMPLETE && call->need_attention) { + call->need_attention = false; __set_current_state(TASK_RUNNING); afs_deliver_to_call(call); continue; } + abort_why = "KWC"; ret = call->error; - if (call->state >= AFS_CALL_COMPLETE) + if (call->state == AFS_CALL_COMPLETE) break; + abort_why = "KWI"; ret = -EINTR; if (signal_pending(current)) break; @@ -605,9 +511,8 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) /* kill the call */ if (call->state < AFS_CALL_COMPLETE) { _debug("call incomplete"); - rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD); - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_CALL_DEAD, -ret, abort_why); } _debug("call complete"); @@ -619,17 +524,24 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) /* * wake up a waiting call */ -static void afs_wake_up_call_waiter(struct afs_call *call) +static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) { + struct afs_call *call = (struct afs_call *)call_user_ID; + + call->need_attention = true; wake_up(&call->waitq); } /* * wake up an asynchronous call */ -static void afs_wake_up_async_call(struct afs_call *call) +static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) { - _enter(""); + struct afs_call *call = (struct afs_call *)call_user_ID; + + call->need_attention = true; queue_work(afs_async_calls, &call->async_work); } @@ -647,8 +559,10 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call) /* * delete an asynchronous call */ -static void afs_delete_async_call(struct afs_call *call) +static void afs_delete_async_call(struct work_struct *work) { + struct afs_call *call = container_of(work, struct afs_call, async_work); + _enter(""); afs_free_call(call); @@ -658,17 +572,19 @@ static void afs_delete_async_call(struct afs_call *call) /* * perform processing on an asynchronous call - * - on a multiple-thread workqueue this work item may try to run on several - * CPUs at the same time */ -static void afs_process_async_call(struct afs_call *call) +static void afs_process_async_call(struct work_struct *work) { + struct afs_call *call = container_of(work, struct afs_call, async_work); + _enter(""); - if (!skb_queue_empty(&call->rx_queue)) + if (call->state < AFS_CALL_COMPLETE && call->need_attention) { + call->need_attention = false; afs_deliver_to_call(call); + } - if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) { + if (call->state == AFS_CALL_COMPLETE && call->wait_mode) { if (call->wait_mode->async_complete) call->wait_mode->async_complete(call->reply, call->error); @@ -679,122 +595,93 @@ static void afs_process_async_call(struct afs_call *call) /* we can't just delete the call because the work item may be * queued */ - call->async_workfn = afs_delete_async_call; + call->async_work.func = afs_delete_async_call; queue_work(afs_async_calls, &call->async_work); } _leave(""); } -/* - * Empty a socket buffer into a flat reply buffer. - */ -int afs_transfer_reply(struct afs_call *call, struct sk_buff *skb, bool last) +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) { - size_t len = skb->len; - - if (len > call->reply_max - call->reply_size) { - _leave(" = -EBADMSG [%zu > %u]", - len, call->reply_max - call->reply_size); - return -EBADMSG; - } + struct afs_call *call = (struct afs_call *)user_call_ID; - if (len > 0) { - if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, - len) < 0) - BUG(); - call->reply_size += len; - } - - afs_data_consumed(call, skb); - if (!last) - return -EAGAIN; - - if (call->reply_size != call->reply_max) { - _leave(" = -EBADMSG [%u != %u]", - call->reply_size, call->reply_max); - return -EBADMSG; - } - return 0; + call->rxcall = rxcall; } /* - * accept the backlog of incoming calls + * Charge the incoming call preallocation. */ -static void afs_collect_incoming_call(struct work_struct *work) +static void afs_charge_preallocation(struct work_struct *work) { - struct rxrpc_call *rxcall; - struct afs_call *call = NULL; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&afs_incoming_calls))) { - _debug("new call"); - - /* don't need the notification */ - afs_free_skb(skb); + struct afs_call *call = afs_spare_incoming_call; + for (;;) { if (!call) { call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); - if (!call) { - rxrpc_kernel_reject_call(afs_socket); - return; - } + if (!call) + break; - call->async_workfn = afs_process_async_call; - INIT_WORK(&call->async_work, afs_async_workfn); + INIT_WORK(&call->async_work, afs_process_async_call); call->wait_mode = &afs_async_incoming_call; call->type = &afs_RXCMxxxx; init_waitqueue_head(&call->waitq); - skb_queue_head_init(&call->rx_queue); call->state = AFS_CALL_AWAIT_OP_ID; - - _debug("CALL %p{%s} [%d]", - call, call->type->name, - atomic_read(&afs_outstanding_calls)); - atomic_inc(&afs_outstanding_calls); } - rxcall = rxrpc_kernel_accept_call(afs_socket, - (unsigned long) call); - if (!IS_ERR(rxcall)) { - call->rxcall = rxcall; - call = NULL; - } + if (rxrpc_kernel_charge_accept(afs_socket, + afs_wake_up_async_call, + afs_rx_attach, + (unsigned long)call, + GFP_KERNEL) < 0) + break; + call = NULL; } + afs_spare_incoming_call = call; +} + +/* + * Discard a preallocated call when a socket is shut down. + */ +static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + struct afs_call *call = (struct afs_call *)user_call_ID; - if (call) - afs_free_call(call); + atomic_inc(&afs_outstanding_calls); + call->rxcall = NULL; + afs_free_call(call); +} + +/* + * Notification of an incoming call. + */ +static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + atomic_inc(&afs_outstanding_calls); + queue_work(afs_wq, &afs_charge_preallocation_work); } /* * Grab the operation ID from an incoming cache manager call. The socket * buffer is discarded on error or if we don't yet have sufficient data. */ -static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cm_op_id(struct afs_call *call) { - size_t len = skb->len; - void *oibuf = (void *) &call->operation_ID; + int ret; - _enter("{%u},{%zu},%d", call->offset, len, last); + _enter("{%zu}", call->offset); ASSERTCMP(call->offset, <, 4); /* the operation ID forms the first four bytes of the request data */ - len = min_t(size_t, len, 4 - call->offset); - if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0) - BUG(); - if (!pskb_pull(skb, len)) - BUG(); - call->offset += len; - - if (call->offset < 4) { - afs_data_consumed(call, skb); - _leave(" = -EAGAIN"); - return -EAGAIN; - } + ret = afs_extract_data(call, &call->operation_ID, 4, true); + if (ret < 0) + return ret; call->state = AFS_CALL_AWAIT_REQUEST; + call->offset = 0; /* ask the cache manager to route the call (it'll change the call type * if successful) */ @@ -803,7 +690,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, /* pass responsibility for the remainer of this message off to the * cache manager op */ - return call->type->deliver(call, skb, last); + return call->type->deliver(call); } /* @@ -823,14 +710,15 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) { + switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0)) { case 0: _leave(" [replied]"); return; case -ENOMEM: _debug("oom"); - rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_USER_ABORT, ENOMEM, "KOO"); default: afs_end_call(call); _leave(" [error]"); @@ -859,7 +747,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - n = rxrpc_kernel_send_data(call->rxcall, &msg, len); + n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len); if (n >= 0) { /* Success */ _leave(" [replied]"); @@ -868,7 +756,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); - rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_USER_ABORT, ENOMEM, "KOO"); } afs_end_call(call); _leave(" [error]"); @@ -877,25 +766,40 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) /* * Extract a piece of data from the received data socket buffers. */ -int afs_extract_data(struct afs_call *call, struct sk_buff *skb, - bool last, void *buf, size_t count) +int afs_extract_data(struct afs_call *call, void *buf, size_t count, + bool want_more) { - size_t len = skb->len; + int ret; - _enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count); + _enter("{%s,%zu},,%zu,%d", + call->type->name, call->offset, count, want_more); - ASSERTCMP(call->offset, <, count); + ASSERTCMP(call->offset, <=, count); - len = min_t(size_t, len, count - call->offset); - if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 || - !pskb_pull(skb, len)) - BUG(); - call->offset += len; + ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, + buf, count, &call->offset, + want_more, &call->abort_code); + if (ret == 0 || ret == -EAGAIN) + return ret; - if (call->offset < count) { - afs_data_consumed(call, skb); - _leave(" = -EAGAIN"); - return -EAGAIN; + if (ret == 1) { + switch (call->state) { + case AFS_CALL_AWAIT_REPLY: + call->state = AFS_CALL_COMPLETE; + break; + case AFS_CALL_AWAIT_REQUEST: + call->state = AFS_CALL_REPLYING; + break; + default: + break; + } + return 0; } - return 0; + + if (ret == -ECONNABORTED) + call->error = call->type->abort_to_error(call->abort_code); + else + call->error = ret; + call->state = AFS_CALL_COMPLETE; + return ret; } diff --git a/fs/afs/server.c b/fs/afs/server.c index f342acf3547d..d4066ab7dd55 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -178,13 +178,18 @@ server_in_two_cells: /* * look up a server by its IP address */ -struct afs_server *afs_find_server(const struct in_addr *_addr) +struct afs_server *afs_find_server(const struct sockaddr_rxrpc *srx) { struct afs_server *server = NULL; struct rb_node *p; - struct in_addr addr = *_addr; + struct in_addr addr = srx->transport.sin.sin_addr; - _enter("%pI4", &addr.s_addr); + _enter("{%d,%pI4}", srx->transport.family, &addr.s_addr); + + if (srx->transport.family != AF_INET) { + WARN(true, "AFS does not yes support non-IPv4 addresses\n"); + return NULL; + } read_lock(&afs_servers_lock); diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index f94d1abdc3eb..94bcd97d22b8 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -58,17 +58,16 @@ static int afs_vl_abort_to_error(u32 abort_code) /* * deliver reply data to a VL.GetEntryByXXX call */ -static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call) { struct afs_cache_vlocation *entry; __be32 *bp; u32 tmp; int loop, ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 52976785a32c..45a86396fd2d 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -594,8 +594,8 @@ static void afs_vlocation_reaper(struct work_struct *work) */ int __init afs_vlocation_update_init(void) { - afs_vlocation_update_worker = - create_singlethread_workqueue("kafs_vlupdated"); + afs_vlocation_update_worker = alloc_workqueue("kafs_vlupdated", + WQ_MEM_RECLAIM, 0); return afs_vlocation_update_worker ? 0 : -ENOMEM; } @@ -239,7 +239,12 @@ static struct dentry *aio_mount(struct file_system_type *fs_type, static const struct dentry_operations ops = { .d_dname = simple_dname, }; - return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC); + struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, &ops, + AIO_RING_MAGIC); + + if (!IS_ERR(root)) + root->d_sb->s_iflags |= SB_I_NOEXEC; + return root; } /* aio_setup diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index b493909e7492..d8e6d421c27f 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -417,6 +417,7 @@ static struct dentry *should_expire(struct dentry *dentry, } return NULL; } + /* * Find an eligible tree to time-out * A tree is eligible if :- @@ -432,6 +433,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, struct dentry *root = sb->s_root; struct dentry *dentry; struct dentry *expired; + struct dentry *found; struct autofs_info *ino; if (!root) @@ -442,31 +444,46 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, dentry = NULL; while ((dentry = get_next_positive_subdir(dentry, root))) { + int flags = how; + spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); - if (ino->flags & AUTOFS_INF_WANT_EXPIRE) - expired = NULL; - else - expired = should_expire(dentry, mnt, timeout, how); - if (!expired) { + if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { spin_unlock(&sbi->fs_lock); continue; } + spin_unlock(&sbi->fs_lock); + + expired = should_expire(dentry, mnt, timeout, flags); + if (!expired) + continue; + + spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(expired); ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - spin_lock(&sbi->fs_lock); - if (should_expire(expired, mnt, timeout, how)) { - if (expired != dentry) - dput(dentry); - goto found; - } + /* Make sure a reference is not taken on found if + * things have changed. + */ + flags &= ~AUTOFS_EXP_LEAVES; + found = should_expire(expired, mnt, timeout, how); + if (!found || found != expired) + /* Something has changed, continue */ + goto next; + + if (expired != dentry) + dput(dentry); + + spin_lock(&sbi->fs_lock); + goto found; +next: + spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; + spin_unlock(&sbi->fs_lock); if (expired != dentry) dput(expired); - spin_unlock(&sbi->fs_lock); } return NULL; @@ -483,6 +500,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; + int state; /* Block on any pending expire */ if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE)) @@ -490,8 +508,19 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) if (rcu_walk) return -ECHILD; +retry: spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_EXPIRING) { + state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING); + if (state == AUTOFS_INF_WANT_EXPIRE) { + spin_unlock(&sbi->fs_lock); + /* + * Possibly being selected for expire, wait until + * it's selected or not. + */ + schedule_timeout_uninterruptible(HZ/10); + goto retry; + } + if (state & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 431fd7ee3488..e44271dfceb6 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -431,8 +431,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); - wq->uid = current_uid(); - wq->gid = current_gid(); + wq->uid = current_real_cred()->uid; + wq->gid = current_real_cred()->gid; wq->pid = pid; wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e5495f37c6ed..2472af2798c7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1624,20 +1624,12 @@ static void do_thread_regset_writeback(struct task_struct *task, regset->writeback(task, regset, 1); } -#ifndef PR_REG_SIZE -#define PR_REG_SIZE(S) sizeof(S) -#endif - #ifndef PRSTATUS_SIZE -#define PRSTATUS_SIZE(S) sizeof(S) -#endif - -#ifndef PR_REG_PTR -#define PR_REG_PTR(S) (&((S)->pr_reg)) +#define PRSTATUS_SIZE(S, R) sizeof(S) #endif #ifndef SET_PR_FPVALID -#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V)) +#define SET_PR_FPVALID(S, V, R) ((S)->pr_fpvalid = (V)) #endif static int fill_thread_core_info(struct elf_thread_core_info *t, @@ -1645,6 +1637,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, long signr, size_t *total) { unsigned int i; + unsigned int regset_size = view->regsets[0].n * view->regsets[0].size; /* * NT_PRSTATUS is the one special case, because the regset data @@ -1653,12 +1646,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * We assume that regset 0 is NT_PRSTATUS. */ fill_prstatus(&t->prstatus, t->task, signr); - (void) view->regsets[0].get(t->task, &view->regsets[0], - 0, PR_REG_SIZE(t->prstatus.pr_reg), - PR_REG_PTR(&t->prstatus), NULL); + (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset_size, + &t->prstatus.pr_reg, NULL); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, - PRSTATUS_SIZE(t->prstatus), &t->prstatus); + PRSTATUS_SIZE(t->prstatus, regset_size), &t->prstatus); *total += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); @@ -1688,7 +1680,8 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, regset->core_note_type, size, data); else { - SET_PR_FPVALID(&t->prstatus, 1); + SET_PR_FPVALID(&t->prstatus, + 1, regset_size); fill_note(&t->notes[i], "CORE", NT_PRFPREG, size, data); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index eff3993c77b3..33fe03551105 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -427,6 +427,7 @@ struct btrfs_space_info { struct list_head ro_bgs; struct list_head priority_tickets; struct list_head tickets; + u64 tickets_id; struct rw_semaphore groups_sem; /* for block groups in our same type */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8c8a4d1e02b9..665da8f66ff1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4271,13 +4271,10 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) if (ret < 0) return ret; - /* - * Use new btrfs_qgroup_reserve_data to reserve precious data space - * - * TODO: Find a good method to avoid reserve data space for NOCOW - * range, but don't impact performance on quota disable case. - */ + /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, start, len); + if (ret) + btrfs_free_reserved_data_space_noquota(inode, start, len); return ret; } @@ -4966,12 +4963,12 @@ static void wake_all_tickets(struct list_head *head) */ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) { - struct reserve_ticket *last_ticket = NULL; struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; int flush_state; int commit_cycles = 0; + u64 last_tickets_id; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); @@ -4984,8 +4981,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) spin_unlock(&space_info->lock); return; } - last_ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); + last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); flush_state = FLUSH_DELAYED_ITEMS_NR; @@ -5005,10 +5001,10 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) space_info); ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - if (last_ticket == ticket) { + if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { - last_ticket = ticket; + last_tickets_id = space_info->tickets_id; flush_state = FLUSH_DELAYED_ITEMS_NR; if (commit_cycles) commit_cycles--; @@ -5384,6 +5380,7 @@ again: list_del_init(&ticket->list); num_bytes -= ticket->bytes; ticket->bytes = 0; + space_info->tickets_id++; wake_up(&ticket->wait); } else { ticket->bytes -= num_bytes; @@ -5426,6 +5423,7 @@ again: num_bytes -= ticket->bytes; space_info->bytes_may_use += ticket->bytes; ticket->bytes = 0; + space_info->tickets_id++; wake_up(&ticket->wait); } else { trace_btrfs_space_reservation(fs_info, "space_info", @@ -8216,6 +8214,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; /* * Mixed block groups will exclude before processing the log so we only @@ -8231,9 +8230,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, if (!block_group) return -EINVAL; - ret = btrfs_add_reserved_bytes(block_group, ins->offset, - ins->offset, 0); - BUG_ON(ret); /* logic error */ + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + space_info->bytes_reserved += ins->offset; + block_group->reserved += ins->offset; + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); btrfs_put_block_group(block_group); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b2a2da5893af..7fd939bfbd99 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1634,6 +1634,9 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, int namelen; int ret = 0; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + ret = mnt_want_write_file(file); if (ret) goto out; @@ -1691,6 +1694,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, struct btrfs_ioctl_vol_args *vol_args; int ret; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -1714,6 +1720,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, bool readonly = false; struct btrfs_qgroup_inherit *inherit = NULL; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -2357,6 +2366,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, int ret; int err = 0; + if (!S_ISDIR(dir->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e935035ac034..ef9c55bc7907 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2867,6 +2867,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { blk_finish_plug(&plug); + list_del_init(&root_log_ctx.list); mutex_unlock(&log_root_tree->log_mutex); ret = root_log_ctx.log_ret; goto out; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index c64a0b794d49..df4b3e6fa563 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -597,7 +597,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) if (is_hash_order(new_pos)) { /* no need to reset last_name for a forward seek when * dentries are sotred in hash order */ - } else if (fi->frag |= fpos_frag(new_pos)) { + } else if (fi->frag != fpos_frag(new_pos)) { return true; } rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 6bbec5e784cd..14ae4b8e1a3c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -609,6 +609,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) char *s, *p; char sep; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + return dget(sb->s_root); + full_path = cifs_build_path_to_root(vol, cifs_sb, cifs_sb_master_tcon(cifs_sb)); if (full_path == NULL) @@ -686,26 +689,22 @@ cifs_do_mount(struct file_system_type *fs_type, cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); if (cifs_sb->mountdata == NULL) { root = ERR_PTR(-ENOMEM); - goto out_cifs_sb; + goto out_free; } - if (volume_info->prepath) { - cifs_sb->prepath = kstrdup(volume_info->prepath, GFP_KERNEL); - if (cifs_sb->prepath == NULL) { - root = ERR_PTR(-ENOMEM); - goto out_cifs_sb; - } + rc = cifs_setup_cifs_sb(volume_info, cifs_sb); + if (rc) { + root = ERR_PTR(rc); + goto out_free; } - cifs_setup_cifs_sb(volume_info, cifs_sb); - rc = cifs_mount(cifs_sb, volume_info); if (rc) { if (!(flags & MS_SILENT)) cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n", rc); root = ERR_PTR(rc); - goto out_mountdata; + goto out_free; } mnt_data.vol = volume_info; @@ -735,11 +734,7 @@ cifs_do_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) - root = dget(sb->s_root); - else - root = cifs_get_root(volume_info, sb); - + root = cifs_get_root(volume_info, sb); if (IS_ERR(root)) goto out_super; @@ -752,9 +747,9 @@ out: cifs_cleanup_volume_info(volume_info); return root; -out_mountdata: +out_free: + kfree(cifs_sb->prepath); kfree(cifs_sb->mountdata); -out_cifs_sb: kfree(cifs_sb); out_nls: unload_nls(volume_info->local_nls); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1243bd326591..95dab43646f0 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -184,7 +184,7 @@ extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read); extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int to_read); -extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, +extern int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7ae03283bd61..2e4f4bad8b1e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2781,6 +2781,24 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) return 1; } +static int +match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data) +{ + struct cifs_sb_info *old = CIFS_SB(sb); + struct cifs_sb_info *new = mnt_data->cifs_sb; + + if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) { + if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)) + return 0; + /* The prepath should be null terminated strings */ + if (strcmp(new->prepath, old->prepath)) + return 0; + + return 1; + } + return 0; +} + int cifs_match_super(struct super_block *sb, void *data) { @@ -2808,7 +2826,8 @@ cifs_match_super(struct super_block *sb, void *data) if (!match_server(tcp_srv, volume_info) || !match_session(ses, volume_info) || - !match_tcon(tcon, volume_info->UNC)) { + !match_tcon(tcon, volume_info->UNC) || + !match_prepath(sb, mnt_data)) { rc = 0; goto out; } @@ -3222,7 +3241,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, } } -void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, +int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb) { INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); @@ -3316,6 +3335,14 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n"); + + if (pvolume_info->prepath) { + cifs_sb->prepath = kstrdup(pvolume_info->prepath, GFP_KERNEL); + if (cifs_sb->prepath == NULL) + return -ENOMEM; + } + + return 0; } static void diff --git a/fs/configfs/file.c b/fs/configfs/file.c index c30cf49b69d2..2c6312db8516 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -333,6 +333,7 @@ configfs_write_bin_file(struct file *file, const char __user *buf, if (bin_attr->cb_max_size && *ppos + count > bin_attr->cb_max_size) { len = -EFBIG; + goto out; } tbuf = vmalloc(*ppos + count); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 0f9961eede1e..ed115acb5dee 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -11,6 +11,7 @@ #include <linux/random.h> #include <linux/string.h> #include <linux/fscrypto.h> +#include <linux/mount.h> static int inode_has_encryption_context(struct inode *inode) { @@ -92,26 +93,42 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct inode *inode, +int fscrypt_process_policy(struct file *filp, const struct fscrypt_policy *policy) { + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + if (policy->version != 0) return -EINVAL; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!inode_has_encryption_context(inode)) { - if (!inode->i_sb->s_cop->empty_dir) - return -EOPNOTSUPP; - if (!inode->i_sb->s_cop->empty_dir(inode)) - return -ENOTEMPTY; - return create_encryption_context_from_policy(inode, policy); + if (!S_ISDIR(inode->i_mode)) + ret = -EINVAL; + else if (!inode->i_sb->s_cop->empty_dir) + ret = -EOPNOTSUPP; + else if (!inode->i_sb->s_cop->empty_dir(inode)) + ret = -ENOTEMPTY; + else + ret = create_encryption_context_from_policy(inode, + policy); + } else if (!is_encryption_context_consistent_with_policy(inode, + policy)) { + printk(KERN_WARNING + "%s: Policy inconsistent with encryption context\n", + __func__); + ret = -EINVAL; } - if (is_encryption_context_consistent_with_policy(inode, policy)) - return 0; - - printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", - __func__); - return -EINVAL; + mnt_drop_write_file(filp); + return ret; } EXPORT_SYMBOL(fscrypt_process_policy); @@ -31,6 +31,8 @@ #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> +#include <linux/iomap.h> +#include "internal.h" /* * We use lowest available bit in exceptional entry for locking, other two @@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry, return VM_FAULT_LOCKED; } -static int copy_user_bh(struct page *to, struct inode *inode, - struct buffer_head *bh, unsigned long vaddr) +static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, + struct page *to, unsigned long vaddr) { struct blk_dax_ctl dax = { - .sector = to_sector(bh, inode), - .size = bh->b_size, + .sector = sector, + .size = size, }; - struct block_device *bdev = bh->b_bdev; void *vto; if (dax_map_atomic(bdev, &dax) < 0) @@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping, EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, - struct buffer_head *bh, void **entryp, - struct vm_area_struct *vma, struct vm_fault *vmf) + struct block_device *bdev, sector_t sector, size_t size, + void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; - struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { - .sector = to_sector(bh, mapping->host), - .size = bh->b_size, + .sector = sector, + .size = size, }; void *ret; void *entry = *entryp; @@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (vmf->cow_page) { struct page *new_page = vmf->cow_page; if (buffer_written(&bh)) - error = copy_user_bh(new_page, inode, &bh, vaddr); + error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode), + bh.b_size, new_page, vaddr); else clear_user_highpage(new_page, vaddr); if (error) @@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, /* Filesystem should not return unwritten buffers to us! */ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); - error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf); + error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode), + bh.b_size, &entry, vma, vmf); unlock_entry: put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: @@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) return dax_zero_page_range(inode, from, length, get_block); } EXPORT_SYMBOL_GPL(dax_truncate_page); + +#ifdef CONFIG_FS_IOMAP +static loff_t +iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct iov_iter *iter = data; + loff_t end = pos + length, done = 0; + ssize_t ret = 0; + + if (iov_iter_rw(iter) == READ) { + end = min(end, i_size_read(inode)); + if (pos >= end) + return 0; + + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) + return iov_iter_zero(min(length, end - pos), iter); + } + + if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) + return -EIO; + + while (pos < end) { + unsigned offset = pos & (PAGE_SIZE - 1); + struct blk_dax_ctl dax = { 0 }; + ssize_t map_len; + + dax.sector = iomap->blkno + + (((pos & PAGE_MASK) - iomap->offset) >> 9); + dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; + map_len = dax_map_atomic(iomap->bdev, &dax); + if (map_len < 0) { + ret = map_len; + break; + } + + dax.addr += offset; + map_len -= offset; + if (map_len > end - pos) + map_len = end - pos; + + if (iov_iter_rw(iter) == WRITE) + map_len = copy_from_iter_pmem(dax.addr, map_len, iter); + else + map_len = copy_to_iter(dax.addr, map_len, iter); + dax_unmap_atomic(iomap->bdev, &dax); + if (map_len <= 0) { + ret = map_len ? map_len : -EFAULT; + break; + } + + pos += map_len; + length -= map_len; + done += map_len; + } + + return done ? done : ret; +} + +/** + * iomap_dax_rw - Perform I/O to a DAX file + * @iocb: The control block for this I/O + * @iter: The addresses to do I/O from or to + * @ops: iomap ops passed from the file system + * + * This function performs read and write operations to directly mapped + * persistent memory. The callers needs to take care of read/write exclusion + * and evicting any page cache pages in the region under I/O. + */ +ssize_t +iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, + struct iomap_ops *ops) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + loff_t pos = iocb->ki_pos, ret = 0, done = 0; + unsigned flags = 0; + + if (iov_iter_rw(iter) == WRITE) + flags |= IOMAP_WRITE; + + /* + * Yes, even DAX files can have page cache attached to them: A zeroed + * page is inserted into the pagecache when we have to serve a write + * fault on a hole. It should never be dirtied and can simply be + * dropped from the pagecache once we get real data for the page. + * + * XXX: This is racy against mmap, and there's nothing we can do about + * it. We'll eventually need to shift this down even further so that + * we can check if we allocated blocks over a hole first. + */ + if (mapping->nrpages) { + ret = invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, + (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + } + + while (iov_iter_count(iter)) { + ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, + iter, iomap_dax_actor); + if (ret <= 0) + break; + pos += ret; + done += ret; + } + + iocb->ki_pos += done; + return done ? done : ret; +} +EXPORT_SYMBOL_GPL(iomap_dax_rw); + +/** + * iomap_dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @ops: iomap ops passed from the file system + * + * When a page fault occurs, filesystems may call this helper in their fault + * or mkwrite handler for DAX files. Assumes the caller has done all the + * necessary locking for the page fault to proceed successfully. + */ +int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + unsigned long vaddr = (unsigned long)vmf->virtual_address; + loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; + sector_t sector; + struct iomap iomap = { 0 }; + unsigned flags = 0; + int error, major = 0; + void *entry; + + /* + * Check whether offset isn't beyond end of file now. Caller is supposed + * to hold locks serializing us with truncate / punch hole so this is + * a reliable test. + */ + if (pos >= i_size_read(inode)) + return VM_FAULT_SIGBUS; + + entry = grab_mapping_entry(mapping, vmf->pgoff); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto out; + } + + if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) + flags |= IOMAP_WRITE; + + /* + * Note that we don't bother to use iomap_apply here: DAX required + * the file system block size to be equal the page size, which means + * that we never have to deal with more than a single extent here. + */ + error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); + if (error) + goto unlock_entry; + if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { + error = -EIO; /* fs corruption? */ + goto unlock_entry; + } + + sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9); + + if (vmf->cow_page) { + switch (iomap.type) { + case IOMAP_HOLE: + case IOMAP_UNWRITTEN: + clear_user_highpage(vmf->cow_page, vaddr); + break; + case IOMAP_MAPPED: + error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE, + vmf->cow_page, vaddr); + break; + default: + WARN_ON_ONCE(1); + error = -EIO; + break; + } + + if (error) + goto unlock_entry; + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + return VM_FAULT_LOCKED; + } + vmf->entry = entry; + return VM_FAULT_DAX_LOCKED; + } + + switch (iomap.type) { + case IOMAP_MAPPED: + if (iomap.flags & IOMAP_F_NEW) { + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + } + error = dax_insert_mapping(mapping, iomap.bdev, sector, + PAGE_SIZE, &entry, vma, vmf); + break; + case IOMAP_UNWRITTEN: + case IOMAP_HOLE: + if (!(vmf->flags & FAULT_FLAG_WRITE)) + return dax_load_hole(mapping, entry, vmf); + /*FALLTHRU*/ + default: + WARN_ON_ONCE(1); + error = -EIO; + break; + } + + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + out: + if (error == -ENOMEM) + return VM_FAULT_OOM | major; + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if (error < 0 && error != -EBUSY) + return VM_FAULT_SIGBUS | major; + return VM_FAULT_NOPAGE | major; +} +EXPORT_SYMBOL_GPL(iomap_dax_fault); +#endif /* CONFIG_FS_IOMAP */ diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 592059f88e04..354e2ab62031 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -97,9 +97,6 @@ EXPORT_SYMBOL_GPL(debugfs_use_file_finish); #define F_DENTRY(filp) ((filp)->f_path.dentry) -#define REAL_FOPS_DEREF(dentry) \ - ((const struct file_operations *)(dentry)->d_fsdata) - static int open_proxy_open(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); @@ -112,7 +109,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) goto out; } - real_fops = REAL_FOPS_DEREF(dentry); + real_fops = debugfs_real_fops(filp); real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not clean up after itself at exit? */ @@ -143,7 +140,7 @@ static ret_type full_proxy_ ## name(proto) \ { \ const struct dentry *dentry = F_DENTRY(filp); \ const struct file_operations *real_fops = \ - REAL_FOPS_DEREF(dentry); \ + debugfs_real_fops(filp); \ int srcu_idx; \ ret_type r; \ \ @@ -176,7 +173,7 @@ static unsigned int full_proxy_poll(struct file *filp, struct poll_table_struct *wait) { const struct dentry *dentry = F_DENTRY(filp); - const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + const struct file_operations *real_fops = debugfs_real_fops(filp); int srcu_idx; unsigned int r = 0; @@ -193,7 +190,7 @@ static unsigned int full_proxy_poll(struct file *filp, static int full_proxy_release(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); - const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + const struct file_operations *real_fops = debugfs_real_fops(filp); const struct file_operations *proxy_fops = filp->f_op; int r = 0; @@ -209,7 +206,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp) replace_fops(filp, d_inode(dentry)->i_fop); kfree((void *)proxy_fops); fops_put(real_fops); - return 0; + return r; } static void __full_proxy_fops_init(struct file_operations *proxy_fops, @@ -241,7 +238,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp) goto out; } - real_fops = REAL_FOPS_DEREF(dentry); + real_fops = debugfs_real_fops(filp); real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not cleanup after itself at exit? */ diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index bba52634b995..b3e8443a1f47 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -19,8 +19,4 @@ extern const struct file_operations debugfs_noop_file_operations; extern const struct file_operations debugfs_open_proxy_file_operations; extern const struct file_operations debugfs_full_proxy_file_operations; -struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fops); - #endif /* _DEBUGFS_INTERNAL_H_ */ diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 79a5941c2474..442d1a7e671b 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -272,13 +272,8 @@ static int mknod_ptmx(struct super_block *sb) struct dentry *root = sb->s_root; struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; - kuid_t root_uid; - kgid_t root_gid; - - root_uid = make_kuid(current_user_ns(), 0); - root_gid = make_kgid(current_user_ns(), 0); - if (!uid_valid(root_uid) || !gid_valid(root_gid)) - return -EINVAL; + kuid_t ptmx_uid = current_fsuid(); + kgid_t ptmx_gid = current_fsgid(); inode_lock(d_inode(root)); @@ -309,8 +304,8 @@ static int mknod_ptmx(struct super_block *sb) mode = S_IFCHR|opts->ptmxmode; init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); - inode->i_uid = root_uid; - inode->i_gid = root_gid; + inode->i_uid = ptmx_uid; + inode->i_gid = ptmx_gid; d_add(dentry, inode); @@ -336,7 +331,6 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; - sync_filesystem(sb); err = parse_mount_options(data, PARSE_REMOUNT, opts); /* @@ -395,6 +389,7 @@ static int devpts_fill_super(struct super_block *s, void *data, int silent) { struct inode *inode; + int error; s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; @@ -403,10 +398,16 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_time_gran = 1; + error = -ENOMEM; s->s_fs_info = new_pts_fs_info(s); if (!s->s_fs_info) goto fail; + error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts); + if (error) + goto fail; + + error = -ENOMEM; inode = new_inode(s); if (!inode) goto fail; @@ -418,13 +419,21 @@ devpts_fill_super(struct super_block *s, void *data, int silent) set_nlink(inode, 2); s->s_root = d_make_root(inode); - if (s->s_root) - return 0; + if (!s->s_root) { + pr_err("get root dentry failed\n"); + goto fail; + } - pr_err("get root dentry failed\n"); + error = mknod_ptmx(s); + if (error) + goto fail_dput; + return 0; +fail_dput: + dput(s->s_root); + s->s_root = NULL; fail: - return -ENOMEM; + return error; } /* @@ -436,43 +445,15 @@ fail: static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - int error; - struct pts_mount_opts opts; - struct super_block *s; - - error = parse_mount_options(data, PARSE_MOUNT, &opts); - if (error) - return ERR_PTR(error); - - s = sget(fs_type, NULL, set_anon_super, flags, NULL); - if (IS_ERR(s)) - return ERR_CAST(s); - - if (!s->s_root) { - error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0); - if (error) - goto out_undo_sget; - s->s_flags |= MS_ACTIVE; - } - - memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); - - error = mknod_ptmx(s); - if (error) - goto out_undo_sget; - - return dget(s->s_root); - -out_undo_sget: - deactivate_locked_super(s); - return ERR_PTR(error); + return mount_nodev(fs_type, flags, data, devpts_fill_super); } static void devpts_kill_sb(struct super_block *sb) { struct pts_fs_info *fsi = DEVPTS_SB(sb); - ida_destroy(&fsi->allocated_ptys); + if (fsi) + ida_destroy(&fsi->allocated_ptys); kfree(fsi); kill_litter_super(sb); } diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 1d73fc6dba13..cbb50cadcffc 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -105,7 +105,10 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry, inode->i_private = var; - efivar_entry_add(var, &efivarfs_list); + err = efivar_entry_add(var, &efivarfs_list); + if (err) + goto out; + d_instantiate(dentry, inode); dget(dentry); out: diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 688ccc16b702..d7a7c53803c1 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -157,12 +157,14 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, goto fail_inode; } + efivar_entry_size(entry, &size); + err = efivar_entry_add(entry, &efivarfs_list); + if (err) + goto fail_inode; + /* copied by the above to local storage in the dentry. */ kfree(name); - efivar_entry_size(entry, &size); - efivar_entry_add(entry, &efivarfs_list); - inode_lock(inode); inode->i_private = entry; i_size_write(inode, size + sizeof(entry->var.Attributes)); @@ -182,7 +184,10 @@ fail: static int efivarfs_destroy(struct efivar_entry *entry, void *data) { - efivar_entry_remove(entry); + int err = efivar_entry_remove(entry); + + if (err) + return err; kfree(entry); return 0; } diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index c634874e12d9..36bea5adcaba 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,5 +1,6 @@ config EXT2_FS tristate "Second extended fs support" + select FS_IOMAP if FS_DAX help Ext2 is a standard Linux file system for hard disks. diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 06af2f92226c..37e2be784ac7 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -814,6 +814,7 @@ extern const struct file_operations ext2_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; extern const struct address_space_operations ext2_nobh_aops; +extern struct iomap_ops ext2_iomap_ops; /* namei.c */ extern const struct inode_operations ext2_dir_inode_operations; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 5efeefe17abb..423cc01c9d41 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -22,11 +22,59 @@ #include <linux/pagemap.h> #include <linux/dax.h> #include <linux/quotaops.h> +#include <linux/iomap.h> +#include <linux/uio.h> #include "ext2.h" #include "xattr.h" #include "acl.h" #ifdef CONFIG_FS_DAX +static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + ssize_t ret; + + if (!iov_iter_count(to)) + return 0; /* skip atime */ + + inode_lock_shared(inode); + ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} + +static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out_unlock; + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + ret = file_update_time(file); + if (ret) + goto out_unlock; + + ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops); + if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { + i_size_write(inode, iocb->ki_pos); + mark_inode_dirty(inode); + } + +out_unlock: + inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} + /* * The lock ordering for ext2 DAX fault paths is: * @@ -51,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = dax_fault(vma, vmf, ext2_get_block); + ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -156,14 +204,28 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; } -/* - * We have mostly NULL's here: the current defaults are ok for - * the ext2 filesystem. - */ +static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ +#ifdef CONFIG_FS_DAX + if (IS_DAX(iocb->ki_filp->f_mapping->host)) + return ext2_dax_read_iter(iocb, to); +#endif + return generic_file_read_iter(iocb, to); +} + +static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ +#ifdef CONFIG_FS_DAX + if (IS_DAX(iocb->ki_filp->f_mapping->host)) + return ext2_dax_write_iter(iocb, from); +#endif + return generic_file_write_iter(iocb, from); +} + const struct file_operations ext2_file_operations = { .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, + .read_iter = ext2_file_read_iter, + .write_iter = ext2_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index efe5fb21c533..04e73a99902b 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -465,6 +465,11 @@ struct inode *ext2_new_inode(struct inode *dir, umode_t mode, for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext2_get_group_desc(sb, group, &bh2); + if (!gdp) { + if (++group == sbi->s_groups_count) + group = 0; + continue; + } brelse(bitmap_bh); bitmap_bh = read_inode_bitmap(sb, group); if (!bitmap_bh) { diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index d5c7d09919f3..1e72d425fd3b 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -32,6 +32,7 @@ #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/fiemap.h> +#include <linux/iomap.h> #include <linux/namei.h> #include <linux/uio.h> #include "ext2.h" @@ -618,7 +619,7 @@ static void ext2_splice_branch(struct inode *inode, */ static int ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long maxblocks, - struct buffer_head *bh_result, + u32 *bno, bool *new, bool *boundary, int create) { int err = -EIO; @@ -644,7 +645,6 @@ static int ext2_get_blocks(struct inode *inode, /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); - clear_buffer_new(bh_result); /* What's this do? */ count++; /*map more blocks*/ while (count < maxblocks && count <= blocks_to_boundary) { @@ -699,7 +699,6 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; - clear_buffer_new(bh_result); goto got_it; } } @@ -733,6 +732,16 @@ static int ext2_get_blocks(struct inode *inode, } if (IS_DAX(inode)) { + int i; + + /* + * We must unmap blocks before zeroing so that writeback cannot + * overwrite zeros with stale data from block device page cache. + */ + for (i = 0; i < count; i++) { + unmap_underlying_metadata(inode->i_sb->s_bdev, + le32_to_cpu(chain[depth-1].key) + i); + } /* * block must be initialised before we put it in the tree * so that it's not found by another thread before it's @@ -745,15 +754,16 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); goto cleanup; } - } else - set_buffer_new(bh_result); + } else { + *new = true; + } ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); got_it: - map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + *bno = le32_to_cpu(chain[depth-1].key); if (count > blocks_to_boundary) - set_buffer_boundary(bh_result); + *boundary = true; err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ @@ -765,19 +775,82 @@ cleanup: return err; } -int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +int ext2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - int ret = ext2_get_blocks(inode, iblock, max_blocks, - bh_result, create); - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - ret = 0; + bool new = false, boundary = false; + u32 bno; + int ret; + + ret = ext2_get_blocks(inode, iblock, max_blocks, &bno, &new, &boundary, + create); + if (ret <= 0) + return ret; + + map_bh(bh_result, inode->i_sb, bno); + bh_result->b_size = (ret << inode->i_blkbits); + if (new) + set_buffer_new(bh_result); + if (boundary) + set_buffer_boundary(bh_result); + return 0; + +} + +#ifdef CONFIG_FS_DAX +static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap) +{ + unsigned int blkbits = inode->i_blkbits; + unsigned long first_block = offset >> blkbits; + unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits; + bool new = false, boundary = false; + u32 bno; + int ret; + + ret = ext2_get_blocks(inode, first_block, max_blocks, + &bno, &new, &boundary, flags & IOMAP_WRITE); + if (ret < 0) + return ret; + + iomap->flags = 0; + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = (u64)first_block << blkbits; + + if (ret == 0) { + iomap->type = IOMAP_HOLE; + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->length = 1 << blkbits; + } else { + iomap->type = IOMAP_MAPPED; + iomap->blkno = (sector_t)bno << (blkbits - 9); + iomap->length = (u64)ret << blkbits; + iomap->flags |= IOMAP_F_MERGED; } - return ret; + if (new) + iomap->flags |= IOMAP_F_NEW; + return 0; } +static int +ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) +{ + if (iomap->type == IOMAP_MAPPED && + written < length && + (flags & IOMAP_WRITE)) + ext2_write_failed(inode->i_mapping, offset + length); + return 0; +} + +struct iomap_ops ext2_iomap_ops = { + .iomap_begin = ext2_iomap_begin, + .iomap_end = ext2_iomap_end, +}; +#endif /* CONFIG_FS_DAX */ + int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -863,11 +936,10 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) loff_t offset = iocb->ki_pos; ssize_t ret; - if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL, - DIO_LOCKING); - else - ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); + if (WARN_ON_ONCE(IS_DAX(inode))) + return -EIO; + + ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) ext2_write_failed(mapping, offset + count); return ret; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 10686fd67fb4..1bb7df5e4536 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -776,7 +776,7 @@ resizefs_out: (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; - return fscrypt_process_policy(inode, &policy); + return fscrypt_process_policy(filp, &policy); #else return -EOPNOTSUPP; #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 47abb96098e4..28f4f4cbb8d8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1757,21 +1757,14 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - int ret; if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; - ret = mnt_want_write_file(filp); - if (ret) - return ret; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - ret = fscrypt_process_policy(inode, &policy); - mnt_drop_write_file(filp); - return ret; + return fscrypt_process_policy(filp, &policy); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 1b2f6c2c3aaf..76f09ce7e5b2 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -1,5 +1,6 @@ config FUSE_FS tristate "FUSE (Filesystem in Userspace) support" + select FS_POSIX_ACL help With FUSE it is possible to implement a fully functional filesystem in a userspace program. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index e95eeb445e58..60da84a86dab 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o -fuse-objs := dev.o dir.o file.o inode.o control.o +fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c new file mode 100644 index 000000000000..ec85765502f1 --- /dev/null +++ b/fs/fuse/acl.c @@ -0,0 +1,99 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2016 Canonical Ltd. <seth.forshee@canonical.com> + * + * This program can be distributed under the terms of the GNU GPL. + * See the file COPYING. + */ + +#include "fuse_i.h" + +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> + +struct posix_acl *fuse_get_acl(struct inode *inode, int type) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + int size; + const char *name; + void *value = NULL; + struct posix_acl *acl; + + if (!fc->posix_acl || fc->no_getxattr) + return NULL; + + if (type == ACL_TYPE_ACCESS) + name = XATTR_NAME_POSIX_ACL_ACCESS; + else if (type == ACL_TYPE_DEFAULT) + name = XATTR_NAME_POSIX_ACL_DEFAULT; + else + return ERR_PTR(-EOPNOTSUPP); + + value = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + size = fuse_getxattr(inode, name, value, PAGE_SIZE); + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if ((size == 0) || (size == -ENODATA) || + (size == -EOPNOTSUPP && fc->no_getxattr)) + acl = NULL; + else if (size == -ERANGE) + acl = ERR_PTR(-E2BIG); + else + acl = ERR_PTR(size); + + kfree(value); + return acl; +} + +int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + const char *name; + int ret; + + if (!fc->posix_acl || fc->no_setxattr) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_ACCESS) + name = XATTR_NAME_POSIX_ACL_ACCESS; + else if (type == ACL_TYPE_DEFAULT) + name = XATTR_NAME_POSIX_ACL_DEFAULT; + else + return -EINVAL; + + if (acl) { + /* + * Fuse userspace is responsible for updating access + * permissions in the inode, if needed. fuse_setxattr + * invalidates the inode attributes, which will force + * them to be refreshed the next time they are used, + * and it also updates i_ctime. + */ + size_t size = posix_acl_xattr_size(acl->a_count); + void *value; + + if (size > PAGE_SIZE) + return -E2BIG; + + value = kmalloc(size, GFP_KERNEL); + if (!value) + return -ENOMEM; + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (ret < 0) { + kfree(value); + return ret; + } + + ret = fuse_setxattr(inode, name, value, size, 0); + kfree(value); + } else { + ret = fuse_removexattr(inode, name); + } + forget_all_cached_acls(inode); + fuse_invalidate_attr(inode); + + return ret; +} diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a94d2ed81ab4..c41bde26c338 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -767,7 +767,6 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->len = err; cs->offset = off; cs->pg = page; - cs->offset = off; iov_iter_advance(cs->iter, err); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c47b7780ce37..f7c84ab835ca 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -13,6 +13,8 @@ #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) { @@ -37,47 +39,39 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } -#if BITS_PER_LONG >= 64 +union fuse_dentry { + u64 time; + struct rcu_head rcu; +}; + static inline void fuse_dentry_settime(struct dentry *entry, u64 time) { - entry->d_time = time; + ((union fuse_dentry *) entry->d_fsdata)->time = time; } static inline u64 fuse_dentry_time(struct dentry *entry) { - return entry->d_time; -} -#else -/* - * On 32 bit archs store the high 32 bits of time in d_fsdata - */ -static void fuse_dentry_settime(struct dentry *entry, u64 time) -{ - entry->d_time = time; - entry->d_fsdata = (void *) (unsigned long) (time >> 32); -} - -static u64 fuse_dentry_time(struct dentry *entry) -{ - return (u64) entry->d_time + - ((u64) (unsigned long) entry->d_fsdata << 32); + return ((union fuse_dentry *) entry->d_fsdata)->time; } -#endif /* * FUSE caches dentries and attributes with separate timeout. The * time in jiffies until the dentry/attributes are valid is stored in - * dentry->d_time and fuse_inode->i_time respectively. + * dentry->d_fsdata and fuse_inode->i_time respectively. */ /* * Calculate the time in jiffies until a dentry/attributes are valid */ -static u64 time_to_jiffies(unsigned long sec, unsigned long nsec) +static u64 time_to_jiffies(u64 sec, u32 nsec) { if (sec || nsec) { - struct timespec ts = {sec, nsec}; - return get_jiffies_64() + timespec_to_jiffies(&ts); + struct timespec64 ts = { + sec, + max_t(u32, nsec, NSEC_PER_SEC - 1) + }; + + return get_jiffies_64() + timespec64_to_jiffies(&ts); } else return 0; } @@ -243,6 +237,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) goto invalid; + forget_all_cached_acls(inode); fuse_change_attributes(inode, &outarg.attr, entry_attr_timeout(&outarg), attr_version); @@ -272,8 +267,23 @@ static int invalid_nodeid(u64 nodeid) return !nodeid || nodeid == FUSE_ROOT_ID; } +static int fuse_dentry_init(struct dentry *dentry) +{ + dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); + + return dentry->d_fsdata ? 0 : -ENOMEM; +} +static void fuse_dentry_release(struct dentry *dentry) +{ + union fuse_dentry *fd = dentry->d_fsdata; + + kfree_rcu(fd, rcu); +} + const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, + .d_init = fuse_dentry_init, + .d_release = fuse_dentry_release, }; int fuse_valid_type(int m) @@ -634,7 +644,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, return create_new_entry(fc, &args, dir, entry, S_IFLNK); } -static inline void fuse_update_ctime(struct inode *inode) +void fuse_update_ctime(struct inode *inode) { if (!IS_NOCMTIME(inode)) { inode->i_ctime = current_fs_time(inode->i_sb); @@ -917,6 +927,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, if (time_before64(fi->i_time, get_jiffies_64())) { r = true; + forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else { r = false; @@ -1017,7 +1028,7 @@ int fuse_allow_current_process(struct fuse_conn *fc) { const struct cred *cred; - if (fc->flags & FUSE_ALLOW_OTHER) + if (fc->allow_other) return 1; cred = current_cred(); @@ -1064,6 +1075,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) if (mask & MAY_NOT_BLOCK) return -ECHILD; + forget_all_cached_acls(inode); return fuse_do_getattr(inode, NULL, NULL); } @@ -1092,7 +1104,7 @@ static int fuse_permission(struct inode *inode, int mask) /* * If attributes are needed, refresh them before proceeding */ - if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) || + if (fc->default_permissions || ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -1105,7 +1117,7 @@ static int fuse_permission(struct inode *inode, int mask) } } - if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { + if (fc->default_permissions) { err = generic_permission(inode, mask); /* If permission is denied, try to refresh file @@ -1233,6 +1245,7 @@ retry: fi->nlookup++; spin_unlock(&fc->lock); + forget_all_cached_acls(inode); fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o), attr_version); @@ -1605,7 +1618,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, int err; bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); - if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; err = inode_change_ok(inode, attr); @@ -1702,172 +1715,75 @@ error: static int fuse_setattr(struct dentry *entry, struct iattr *attr) { struct inode *inode = d_inode(entry); - - if (!fuse_allow_current_process(get_fuse_conn(inode))) - return -EACCES; - - if (attr->ia_valid & ATTR_FILE) - return fuse_do_setattr(inode, attr, attr->ia_file); - else - return fuse_do_setattr(inode, attr, NULL); -} - -static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, - struct kstat *stat) -{ - struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); + struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL; + int ret; - if (!fuse_allow_current_process(fc)) + if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; - return fuse_update_attributes(inode, stat, NULL, NULL); -} - -static int fuse_setxattr(struct dentry *unused, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - struct fuse_setxattr_in inarg; - int err; - - if (fc->no_setxattr) - return -EOPNOTSUPP; + if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) { + attr->ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | + ATTR_MODE); - memset(&inarg, 0, sizeof(inarg)); - inarg.size = size; - inarg.flags = flags; - args.in.h.opcode = FUSE_SETXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 3; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = strlen(name) + 1; - args.in.args[1].value = name; - args.in.args[2].size = size; - args.in.args[2].value = value; - err = fuse_simple_request(fc, &args); - if (err == -ENOSYS) { - fc->no_setxattr = 1; - err = -EOPNOTSUPP; - } - if (!err) { - fuse_invalidate_attr(inode); - fuse_update_ctime(inode); + /* + * The only sane way to reliably kill suid/sgid is to do it in + * the userspace filesystem + * + * This should be done on write(), truncate() and chown(). + */ + if (!fc->handle_killpriv) { + int kill; + + /* + * ia_mode calculation may have used stale i_mode. + * Refresh and recalculate. + */ + ret = fuse_do_getattr(inode, NULL, file); + if (ret) + return ret; + + attr->ia_mode = inode->i_mode; + kill = should_remove_suid(entry); + if (kill & ATTR_KILL_SUID) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode &= ~S_ISUID; + } + if (kill & ATTR_KILL_SGID) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode &= ~S_ISGID; + } + } } - return err; -} - -static ssize_t fuse_getxattr(struct dentry *entry, struct inode *inode, - const char *name, void *value, size_t size) -{ - struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - struct fuse_getxattr_in inarg; - struct fuse_getxattr_out outarg; - ssize_t ret; + if (!attr->ia_valid) + return 0; - if (fc->no_getxattr) - return -EOPNOTSUPP; + ret = fuse_do_setattr(inode, attr, file); + if (!ret) { + /* + * If filesystem supports acls it may have updated acl xattrs in + * the filesystem, so forget cached acls for the inode. + */ + if (fc->posix_acl) + forget_all_cached_acls(inode); - memset(&inarg, 0, sizeof(inarg)); - inarg.size = size; - args.in.h.opcode = FUSE_GETXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = strlen(name) + 1; - args.in.args[1].value = name; - /* This is really two different operations rolled into one */ - args.out.numargs = 1; - if (size) { - args.out.argvar = 1; - args.out.args[0].size = size; - args.out.args[0].value = value; - } else { - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; - } - ret = fuse_simple_request(fc, &args); - if (!ret && !size) - ret = outarg.size; - if (ret == -ENOSYS) { - fc->no_getxattr = 1; - ret = -EOPNOTSUPP; + /* Directory mode changed, may need to revalidate access */ + if (d_is_dir(entry) && (attr->ia_valid & ATTR_MODE)) + fuse_invalidate_entry_cache(entry); } return ret; } -static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) +static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, + struct kstat *stat) { struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - struct fuse_getxattr_in inarg; - struct fuse_getxattr_out outarg; - ssize_t ret; if (!fuse_allow_current_process(fc)) return -EACCES; - if (fc->no_listxattr) - return -EOPNOTSUPP; - - memset(&inarg, 0, sizeof(inarg)); - inarg.size = size; - args.in.h.opcode = FUSE_LISTXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - /* This is really two different operations rolled into one */ - args.out.numargs = 1; - if (size) { - args.out.argvar = 1; - args.out.args[0].size = size; - args.out.args[0].value = list; - } else { - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; - } - ret = fuse_simple_request(fc, &args); - if (!ret && !size) - ret = outarg.size; - if (ret == -ENOSYS) { - fc->no_listxattr = 1; - ret = -EOPNOTSUPP; - } - return ret; -} - -static int fuse_removexattr(struct dentry *entry, const char *name) -{ - struct inode *inode = d_inode(entry); - struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - int err; - - if (fc->no_removexattr) - return -EOPNOTSUPP; - - args.in.h.opcode = FUSE_REMOVEXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = strlen(name) + 1; - args.in.args[0].value = name; - err = fuse_simple_request(fc, &args); - if (err == -ENOSYS) { - fc->no_removexattr = 1; - err = -EOPNOTSUPP; - } - if (!err) { - fuse_invalidate_attr(inode); - fuse_update_ctime(inode); - } - return err; + return fuse_update_attributes(inode, stat, NULL, NULL); } static const struct inode_operations fuse_dir_inode_operations = { @@ -1884,10 +1800,12 @@ static const struct inode_operations fuse_dir_inode_operations = { .mknod = fuse_mknod, .permission = fuse_permission, .getattr = fuse_getattr, - .setxattr = fuse_setxattr, - .getxattr = fuse_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = fuse_listxattr, - .removexattr = fuse_removexattr, + .removexattr = generic_removexattr, + .get_acl = fuse_get_acl, + .set_acl = fuse_set_acl, }; static const struct file_operations fuse_dir_operations = { @@ -1905,10 +1823,12 @@ static const struct inode_operations fuse_common_inode_operations = { .setattr = fuse_setattr, .permission = fuse_permission, .getattr = fuse_getattr, - .setxattr = fuse_setxattr, - .getxattr = fuse_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = fuse_listxattr, - .removexattr = fuse_removexattr, + .removexattr = generic_removexattr, + .get_acl = fuse_get_acl, + .set_acl = fuse_set_acl, }; static const struct inode_operations fuse_symlink_inode_operations = { @@ -1916,10 +1836,10 @@ static const struct inode_operations fuse_symlink_inode_operations = { .get_link = fuse_get_link, .readlink = generic_readlink, .getattr = fuse_getattr, - .setxattr = fuse_setxattr, - .getxattr = fuse_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = fuse_listxattr, - .removexattr = fuse_removexattr, + .removexattr = generic_removexattr, }; void fuse_init_common(struct inode *inode) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f394aff59c36..b7beb67bf005 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -530,13 +530,13 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, req->out.args[0].size = count; } -static void fuse_release_user_pages(struct fuse_req *req, int write) +static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty) { unsigned i; for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; - if (write) + if (should_dirty) set_page_dirty_lock(page); put_page(page); } @@ -1320,6 +1320,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { int write = flags & FUSE_DIO_WRITE; + bool should_dirty = !write && iter_is_iovec(iter); int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->file; struct inode *inode = file->f_mapping->host; @@ -1363,7 +1364,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, nres = fuse_send_read(req, io, pos, nbytes, owner); if (!io->async) - fuse_release_user_pages(req, !write); + fuse_release_user_pages(req, should_dirty); if (req->out.h.error) { err = req->out.h.error; break; @@ -2325,49 +2326,6 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) return retval; } -static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, - unsigned int nr_segs, size_t bytes, bool to_user) -{ - struct iov_iter ii; - int page_idx = 0; - - if (!bytes) - return 0; - - iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes); - - while (iov_iter_count(&ii)) { - struct page *page = pages[page_idx++]; - size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); - void *kaddr; - - kaddr = kmap(page); - - while (todo) { - char __user *uaddr = ii.iov->iov_base + ii.iov_offset; - size_t iov_len = ii.iov->iov_len - ii.iov_offset; - size_t copy = min(todo, iov_len); - size_t left; - - if (!to_user) - left = copy_from_user(kaddr, uaddr, copy); - else - left = copy_to_user(uaddr, kaddr, copy); - - if (unlikely(left)) - return -EFAULT; - - iov_iter_advance(&ii, copy); - todo -= copy; - kaddr += copy; - } - - kunmap(page); - } - - return 0; -} - /* * CUSE servers compiled on 32bit broke on 64bit kernels because the * ABI was defined to be 'struct iovec' which is different on 32bit @@ -2519,8 +2477,9 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, struct iovec *iov_page = NULL; struct iovec *in_iov = NULL, *out_iov = NULL; unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; - size_t in_size, out_size, transferred; - int err; + size_t in_size, out_size, transferred, c; + int err, i; + struct iov_iter ii; #if BITS_PER_LONG == 32 inarg.flags |= FUSE_IOCTL_32BIT; @@ -2602,10 +2561,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, req->in.args[1].size = in_size; req->in.argpages = 1; - err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size, - false); - if (err) - goto out; + err = -EFAULT; + iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { + c = copy_page_from_iter(pages[i], 0, PAGE_SIZE, &ii); + if (c != PAGE_SIZE && iov_iter_count(&ii)) + goto out; + } } req->out.numargs = 2; @@ -2671,7 +2633,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, if (transferred > inarg.out_size) goto out; - err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true); + err = -EFAULT; + iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { + c = copy_page_to_iter(pages[i], 0, PAGE_SIZE, &ii); + if (c != PAGE_SIZE && iov_iter_count(&ii)) + goto out; + } + err = 0; out: if (req) fuse_put_request(fc, req); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d98d8cc84def..24ada5dc4dae 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -23,6 +23,7 @@ #include <linux/poll.h> #include <linux/workqueue.h> #include <linux/kref.h> +#include <linux/xattr.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -36,15 +37,6 @@ /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 -/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem - module will check permissions based on the file mode. Otherwise no - permission checking is done in the kernel */ -#define FUSE_DEFAULT_PERMISSIONS (1 << 0) - -/** If the FUSE_ALLOW_OTHER flag is given, then not only the user - doing the mount will be allowed to access the filesystem */ -#define FUSE_ALLOW_OTHER (1 << 1) - /** Number of page pointers embedded in fuse_req */ #define FUSE_REQ_INLINE_PAGES 1 @@ -469,9 +461,6 @@ struct fuse_conn { /** The group id for this mount */ kgid_t group_id; - /** The fuse mount flags for this mount */ - unsigned flags; - /** Maximum read size */ unsigned max_read; @@ -547,6 +536,9 @@ struct fuse_conn { /** allow parallel lookups and readdir (default is serialized) */ unsigned parallel_dirops:1; + /** handle fs handles killing suid/sgid/cap on write/chown/trunc */ + unsigned handle_killpriv:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -624,6 +616,15 @@ struct fuse_conn { /** Is lseek not implemented by fs? */ unsigned no_lseek:1; + /** Does the filesystem support posix acls? */ + unsigned posix_acl:1; + + /** Check permissions based on the file mode or not? */ + unsigned default_permissions:1; + + /** Allow other than the mounter user to access the filesystem ? */ + unsigned allow_other:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -902,6 +903,8 @@ int fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); +void fuse_update_ctime(struct inode *inode); + int fuse_update_attributes(struct inode *inode, struct kstat *stat, struct file *file, bool *refreshed); @@ -966,4 +969,17 @@ void fuse_set_initialized(struct fuse_conn *fc); void fuse_unlock_inode(struct inode *inode); void fuse_lock_inode(struct inode *inode); +int fuse_setxattr(struct inode *inode, const char *name, const void *value, + size_t size, int flags); +ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, + size_t size); +ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); +int fuse_removexattr(struct inode *inode, const char *name); +extern const struct xattr_handler *fuse_xattr_handlers[]; +extern const struct xattr_handler *fuse_acl_xattr_handlers[]; + +struct posix_acl; +struct posix_acl *fuse_get_acl(struct inode *inode, int type); +int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4e05b51120f4..17141099f2e7 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -20,6 +20,7 @@ #include <linux/random.h> #include <linux/sched.h> #include <linux/exportfs.h> +#include <linux/posix_acl.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -66,7 +67,8 @@ struct fuse_mount_data { unsigned rootmode_present:1; unsigned user_id_present:1; unsigned group_id_present:1; - unsigned flags; + unsigned default_permissions:1; + unsigned allow_other:1; unsigned max_read; unsigned blksize; }; @@ -192,7 +194,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, * check in may_delete(). */ fi->orig_i_mode = inode->i_mode; - if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + if (!fc->default_permissions) inode->i_mode &= ~S_ISVTX; fi->orig_ino = attr->ino; @@ -340,6 +342,7 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, return -ENOENT; fuse_invalidate_attr(inode); + forget_all_cached_acls(inode); if (offset >= 0) { pg_start = offset >> PAGE_SHIFT; if (len <= 0) @@ -532,11 +535,11 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) break; case OPT_DEFAULT_PERMISSIONS: - d->flags |= FUSE_DEFAULT_PERMISSIONS; + d->default_permissions = 1; break; case OPT_ALLOW_OTHER: - d->flags |= FUSE_ALLOW_OTHER; + d->allow_other = 1; break; case OPT_MAX_READ: @@ -570,9 +573,9 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id)); seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id)); - if (fc->flags & FUSE_DEFAULT_PERMISSIONS) + if (fc->default_permissions) seq_puts(m, ",default_permissions"); - if (fc->flags & FUSE_ALLOW_OTHER) + if (fc->allow_other) seq_puts(m, ",allow_other"); if (fc->max_read != ~0) seq_printf(m, ",max_read=%u", fc->max_read); @@ -910,8 +913,15 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->writeback_cache = 1; if (arg->flags & FUSE_PARALLEL_DIROPS) fc->parallel_dirops = 1; + if (arg->flags & FUSE_HANDLE_KILLPRIV) + fc->handle_killpriv = 1; if (arg->time_gran && arg->time_gran <= 1000000000) fc->sb->s_time_gran = arg->time_gran; + if ((arg->flags & FUSE_POSIX_ACL)) { + fc->default_permissions = 1; + fc->posix_acl = 1; + fc->sb->s_xattr = fuse_acl_xattr_handlers; + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -941,7 +951,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | - FUSE_PARALLEL_DIROPS; + FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1071,6 +1081,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) } sb->s_magic = FUSE_SUPER_MAGIC; sb->s_op = &fuse_super_operations; + sb->s_xattr = fuse_xattr_handlers; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; sb->s_export_op = &fuse_export_operations; @@ -1109,7 +1120,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->dont_mask = 1; sb->s_flags |= MS_POSIXACL; - fc->flags = d.flags; + fc->default_permissions = d.default_permissions; + fc->allow_other = d.allow_other; fc->user_id = d.user_id; fc->group_id = d.group_id; fc->max_read = max_t(unsigned, 4096, d.max_read); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c new file mode 100644 index 000000000000..3caac46b08b0 --- /dev/null +++ b/fs/fuse/xattr.c @@ -0,0 +1,211 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2016 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU GPL. + * See the file COPYING. + */ + +#include "fuse_i.h" + +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> + +int fuse_setxattr(struct inode *inode, const char *name, const void *value, + size_t size, int flags) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_setxattr_in inarg; + int err; + + if (fc->no_setxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + inarg.flags = flags; + args.in.h.opcode = FUSE_SETXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 3; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = strlen(name) + 1; + args.in.args[1].value = name; + args.in.args[2].size = size; + args.in.args[2].value = value; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + fc->no_setxattr = 1; + err = -EOPNOTSUPP; + } + if (!err) { + fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } + return err; +} + +ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, + size_t size) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (fc->no_getxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + args.in.h.opcode = FUSE_GETXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = strlen(name) + 1; + args.in.args[1].value = name; + /* This is really two different operations rolled into one */ + args.out.numargs = 1; + if (size) { + args.out.argvar = 1; + args.out.args[0].size = size; + args.out.args[0].value = value; + } else { + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + } + ret = fuse_simple_request(fc, &args); + if (!ret && !size) + ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX); + if (ret == -ENOSYS) { + fc->no_getxattr = 1; + ret = -EOPNOTSUPP; + } + return ret; +} + +static int fuse_verify_xattr_list(char *list, size_t size) +{ + size_t origsize = size; + + while (size) { + size_t thislen = strnlen(list, size); + + if (!thislen || thislen == size) + return -EIO; + + size -= thislen + 1; + list += thislen + 1; + } + + return origsize; +} + +ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) +{ + struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (!fuse_allow_current_process(fc)) + return -EACCES; + + if (fc->no_listxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + args.in.h.opcode = FUSE_LISTXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + /* This is really two different operations rolled into one */ + args.out.numargs = 1; + if (size) { + args.out.argvar = 1; + args.out.args[0].size = size; + args.out.args[0].value = list; + } else { + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + } + ret = fuse_simple_request(fc, &args); + if (!ret && !size) + ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX); + if (ret > 0 && size) + ret = fuse_verify_xattr_list(list, ret); + if (ret == -ENOSYS) { + fc->no_listxattr = 1; + ret = -EOPNOTSUPP; + } + return ret; +} + +int fuse_removexattr(struct inode *inode, const char *name) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + int err; + + if (fc->no_removexattr) + return -EOPNOTSUPP; + + args.in.h.opcode = FUSE_REMOVEXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = strlen(name) + 1; + args.in.args[0].value = name; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + fc->no_removexattr = 1; + err = -EOPNOTSUPP; + } + if (!err) { + fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } + return err; +} + +static int fuse_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) +{ + return fuse_getxattr(inode, name, value, size); +} + +static int fuse_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) +{ + if (!value) + return fuse_removexattr(inode, name); + + return fuse_setxattr(inode, name, value, size, flags); +} + +static const struct xattr_handler fuse_xattr_handler = { + .prefix = "", + .get = fuse_xattr_get, + .set = fuse_xattr_set, +}; + +const struct xattr_handler *fuse_xattr_handlers[] = { + &fuse_xattr_handler, + NULL +}; + +const struct xattr_handler *fuse_acl_xattr_handlers[] = { + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + &fuse_xattr_handler, + NULL +}; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 82df36886938..5a6f52ea2722 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -187,7 +187,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w ClearPageChecked(page); if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); + BIT(BH_Dirty)|BIT(BH_Uptodate)); } gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); } @@ -1147,6 +1147,16 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) if (!page_has_buffers(page)) return 0; + /* + * From xfs_vm_releasepage: mm accommodates an old ext3 case where + * clean pages might not have had the dirty bit cleared. Thus, it can + * send actual dirty pages to ->releasepage() via shrink_active_list(). + * + * As a workaround, we skip pages that contain dirty buffers below. + * Once ->releasepage isn't called on dirty pages anymore, we can warn + * on dirty buffers like we used to here again. + */ + gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); head = bh = page_buffers(page); @@ -1156,8 +1166,8 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) bd = bh->b_private; if (bd && bd->bd_tr) goto cannot_release; - if (buffer_pinned(bh) || buffer_dirty(bh)) - goto not_possible; + if (buffer_dirty(bh) || WARN_ON(buffer_pinned(bh))) + goto cannot_release; bh = bh->b_this_page; } while(bh != head); spin_unlock(&sdp->sd_ail_lock); @@ -1180,9 +1190,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) return try_to_free_buffers(page); -not_possible: /* Should never happen */ - WARN_ON(buffer_dirty(bh)); - WARN_ON(buffer_pinned(bh)); cannot_release: spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 6e2bec1cd289..645721f3ff00 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -82,8 +82,8 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, } if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << inode->i_blkbits, - (1 << BH_Uptodate)); + create_empty_buffers(page, BIT(inode->i_blkbits), + BIT(BH_Uptodate)); bh = page_buffers(page); @@ -690,7 +690,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi BUG_ON(!dblock); BUG_ON(!new); - bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5)); + bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5)); ret = gfs2_block_map(inode, lblock, &bh, create); *extlen = bh.b_size >> inode->i_blkbits; *dblock = bh.b_blocknr; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index fcb59b23f1e3..db8fbeb62483 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -351,7 +351,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) if (hc) return hc; - hsize = 1 << ip->i_depth; + hsize = BIT(ip->i_depth); hsize *= sizeof(__be64); if (hsize != i_size_read(&ip->i_inode)) { gfs2_consist_inode(ip); @@ -819,8 +819,8 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, if (ip->i_diskflags & GFS2_DIF_EXHASH) { struct gfs2_leaf *leaf; - unsigned hsize = 1 << ip->i_depth; - unsigned index; + unsigned int hsize = BIT(ip->i_depth); + unsigned int index; u64 ln; if (hsize * sizeof(u64) != i_size_read(inode)) { gfs2_consist_inode(ip); @@ -932,7 +932,7 @@ static int dir_make_exhash(struct inode *inode) return -ENOSPC; bn = bh->b_blocknr; - gfs2_assert(sdp, dip->i_entries < (1 << 16)); + gfs2_assert(sdp, dip->i_entries < BIT(16)); leaf->lf_entries = cpu_to_be16(dip->i_entries); /* Copy dirents */ @@ -1041,7 +1041,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) bn = nbh->b_blocknr; /* Compute the start and len of leaf pointers in the hash table. */ - len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth)); + len = BIT(dip->i_depth - be16_to_cpu(oleaf->lf_depth)); half_len = len >> 1; if (!half_len) { pr_warn("i_depth %u lf_depth %u index %u\n", @@ -1163,7 +1163,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) int x; int error = 0; - hsize = 1 << dip->i_depth; + hsize = BIT(dip->i_depth); hsize_bytes = hsize * sizeof(__be64); hc = gfs2_dir_get_hash_table(dip); @@ -1539,7 +1539,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx, int error = 0; unsigned depth = 0; - hsize = 1 << dip->i_depth; + hsize = BIT(dip->i_depth); hash = gfs2_dir_offset2hash(ctx->pos); index = hash >> (32 - dip->i_depth); @@ -1558,7 +1558,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx, if (error) break; - len = 1 << (dip->i_depth - depth); + len = BIT(dip->i_depth - depth); index = (index & ~(len - 1)) + len; } @@ -2113,7 +2113,7 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) u64 leaf_no; int error = 0, last; - hsize = 1 << dip->i_depth; + hsize = BIT(dip->i_depth); lp = gfs2_dir_get_hash_table(dip); if (IS_ERR(lp)) @@ -2126,7 +2126,7 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) if (error) goto out; leaf = (struct gfs2_leaf *)bh->b_data; - len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth)); + len = BIT(dip->i_depth - be16_to_cpu(leaf->lf_depth)); next_index = (index & ~(len - 1)) + len; last = ((next_index >= hsize) ? 1 : 0); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 320e65e61938..360188f162bd 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -395,9 +395,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); - /* Update file times before taking page lock */ - file_update_time(vma->vm_file); - ret = gfs2_rsqa_alloc(ip); if (ret) goto out; @@ -409,6 +406,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out_uninit; + /* Update file times before taking page lock */ + file_update_time(vma->vm_file); + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 3a90b2b5b9bb..14cbf60167a7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -69,7 +69,7 @@ static atomic_t lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 -#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) +#define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT) static struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, @@ -1781,7 +1781,13 @@ int __init gfs2_glock_init(void) return -ENOMEM; } - register_shrinker(&glock_shrinker); + ret = register_shrinker(&glock_shrinker); + if (ret) { + destroy_workqueue(gfs2_delete_workqueue); + destroy_workqueue(glock_workqueue); + rhashtable_destroy(&gl_hash_table); + return ret; + } return 0; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e4da0ecd3285..fb3a810b506f 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -187,6 +187,10 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, } gfs2_set_iop(inode); + + inode->i_atime.tv_sec = 0; + inode->i_atime.tv_nsec = 0; + unlock_new_inode(inode); } diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 7710dfd3af35..aace8ce34a18 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -85,7 +85,7 @@ static inline int gfs2_check_internal_file_size(struct inode *inode, u64 size = i_size_read(inode); if (size < minsize || size > maxsize) goto err; - if (size & ((1 << inode->i_blkbits) - 1)) + if (size & (BIT(inode->i_blkbits) - 1)) goto err; return 0; err: diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 74fd0139e6c2..67d1fc4668f7 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -145,7 +145,9 @@ static int __init init_gfs2_fs(void) if (!gfs2_qadata_cachep) goto fail; - register_shrinker(&gfs2_qd_shrinker); + error = register_shrinker(&gfs2_qd_shrinker); + if (error) + goto fail; error = register_filesystem(&gfs2_fs_type); if (error) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 950b8be68e41..373639a59782 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -216,23 +216,26 @@ static void gfs2_meta_read_endio(struct bio *bio) static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], int num) { - struct buffer_head *bh = bhs[0]; - struct bio *bio; - int i; - - if (!num) - return; - - bio = bio_alloc(GFP_NOIO, num); - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; - for (i = 0; i < num; i++) { - bh = bhs[i]; - bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + while (num > 0) { + struct buffer_head *bh = *bhs; + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, num); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + while (num > 0) { + bh = *bhs; + if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { + BUG_ON(bio->bi_iter.bi_size == 0); + break; + } + bhs++; + num--; + } + bio->bi_end_io = gfs2_meta_read_endio; + bio_set_op_attrs(bio, op, op_flags); + submit_bio(bio); } - bio->bi_end_io = gfs2_meta_read_endio; - bio_set_op_attrs(bio, op, op_flags); - submit_bio(bio); } /** diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ef1e1822977f..ff72ac6439c8 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -58,7 +58,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt) gt->gt_quota_scale_num = 1; gt->gt_quota_scale_den = 1; gt->gt_new_files_jdata = 0; - gt->gt_max_readahead = 1 << 18; + gt->gt_max_readahead = BIT(18); gt->gt_complain_secs = 10; } @@ -284,7 +284,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT; - sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) / sizeof(u64); sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - @@ -302,7 +302,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) /* Compute maximum reservation required to add a entry to a directory */ - hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH), + hash_blocks = DIV_ROUND_UP(sizeof(u64) * BIT(GFS2_DIR_MAX_DEPTH), sdp->sd_jbsize); ind_blocks = 0; @@ -1089,7 +1089,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT; - sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 77930ca25303..8af2dfa09236 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -75,7 +75,7 @@ #include "util.h" #define GFS2_QD_HASH_SHIFT 12 -#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT) +#define GFS2_QD_HASH_SIZE BIT(GFS2_QD_HASH_SHIFT) #define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) /* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ @@ -384,7 +384,7 @@ static int bh_get(struct gfs2_quota_data *qd) block = qd->qd_slot / sdp->sd_qc_per_block; offset = qd->qd_slot % sdp->sd_qc_per_block; - bh_map.b_size = 1 << ip->i_inode.i_blkbits; + bh_map.b_size = BIT(ip->i_inode.i_blkbits); error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0); if (error) goto fail; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 3a7e60bb39f8..e3ee387a6dfe 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -359,7 +359,7 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); u64 size = i_size_read(jd->jd_inode); - if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30)) + if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, BIT(30))) return -EIO; jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift; diff --git a/fs/internal.h b/fs/internal.h index ba0737649d4a..859178692ce4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -12,6 +12,7 @@ struct super_block; struct file_system_type; struct iomap; +struct iomap_ops; struct linux_binprm; struct path; struct mount; @@ -164,3 +165,13 @@ extern struct dentry_operations ns_dentry_operations; extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, unsigned long arg); extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + +/* + * iomap support: + */ +typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, + void *data, struct iomap *iomap); + +loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap_ops *ops, void *data, + iomap_actor_t actor); diff --git a/fs/ioctl.c b/fs/ioctl.c index 0f56deb24ce6..c415668c86d4 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -568,7 +568,7 @@ static int ioctl_fsthaw(struct file *filp) return thaw_super(sb); } -static long ioctl_file_dedupe_range(struct file *file, void __user *arg) +static int ioctl_file_dedupe_range(struct file *file, void __user *arg) { struct file_dedupe_range __user *argp = arg; struct file_dedupe_range *same = NULL; @@ -582,6 +582,10 @@ static long ioctl_file_dedupe_range(struct file *file, void __user *arg) } size = offsetof(struct file_dedupe_range __user, info[count]); + if (size > PAGE_SIZE) { + ret = -ENOMEM; + goto out; + } same = memdup_user(argp, size); if (IS_ERR(same)) { diff --git a/fs/iomap.c b/fs/iomap.c index 706270f21b35..013d1d36fbbf 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -27,9 +27,6 @@ #include <linux/dax.h> #include "internal.h" -typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, - void *data, struct iomap *iomap); - /* * Execute a iomap write on a segment of the mapping that spans a * contiguous range of pages that have identical block mapping state. @@ -41,7 +38,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, * resources they require in the iomap_begin call, and release them in the * iomap_end call. */ -static loff_t +loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap_ops *ops, void *data, iomap_actor_t actor) { @@ -252,6 +249,88 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +static struct page * +__iomap_read_page(struct inode *inode, loff_t offset) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; +} + +static loff_t +iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + long status = 0; + ssize_t written = 0; + + do { + struct page *page, *rpage; + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + + offset = (pos & (PAGE_SIZE - 1)); + bytes = min_t(unsigned long, PAGE_SIZE - offset, length); + + rpage = __iomap_read_page(inode, pos); + if (IS_ERR(rpage)) + return PTR_ERR(rpage); + + status = iomap_write_begin(inode, pos, bytes, + AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE, + &page, iomap); + put_page(rpage); + if (unlikely(status)) + return status; + + WARN_ON_ONCE(!PageUptodate(page)); + + status = iomap_write_end(inode, pos, bytes, bytes, page); + if (unlikely(status <= 0)) { + if (WARN_ON_ONCE(status == 0)) + return -EIO; + return status; + } + + cond_resched(); + + pos += status; + written += status; + length -= status; + + balance_dirty_pages_ratelimited(inode->i_mapping); + } while (length); + + return written; +} + +int +iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, + struct iomap_ops *ops) +{ + loff_t ret; + + while (len) { + ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL, + iomap_dirty_actor); + if (ret <= 0) + return ret; + pos += ret; + len -= ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_file_dirty); + static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, unsigned bytes, struct iomap *iomap) { @@ -430,6 +509,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, if (iomap->flags & IOMAP_F_MERGED) flags |= FIEMAP_EXTENT_MERGED; + if (iomap->flags & IOMAP_F_SHARED) + flags |= FIEMAP_EXTENT_SHARED; return fiemap_fill_next_extent(fi, iomap->offset, iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 2e58978d6f45..4d973524c887 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2893,8 +2893,7 @@ restart: * on anon_list2. Let's check. */ if (!list_empty(&TxAnchor.anon_list2)) { - list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list); - INIT_LIST_HEAD(&TxAnchor.anon_list2); + list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); goto restart; } TXN_UNLOCK(); diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index 90b3bc21e9b0..bd9b641ada2c 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -379,8 +379,14 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) * cached in meta-data cache, and not written out * by txCommit(); */ - filemap_fdatawait(ipbmap->i_mapping); - filemap_write_and_wait(ipbmap->i_mapping); + rc = filemap_fdatawait(ipbmap->i_mapping); + if (rc) + goto error_out; + + rc = filemap_write_and_wait(ipbmap->i_mapping); + if (rc) + goto error_out; + diWriteSpecial(ipbmap, 0); newPage = nPages; /* first new page number */ diff --git a/fs/locks.c b/fs/locks.c index ee1b15f6fc13..90ec67108b22 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -127,7 +127,6 @@ #include <linux/pid_namespace.h> #include <linux/hashtable.h> #include <linux/percpu.h> -#include <linux/lglock.h> #define CREATE_TRACE_POINTS #include <trace/events/filelock.h> @@ -158,12 +157,18 @@ int lease_break_time = 45; /* * The global file_lock_list is only used for displaying /proc/locks, so we - * keep a list on each CPU, with each list protected by its own spinlock via - * the file_lock_lglock. Note that alterations to the list also require that - * the relevant flc_lock is held. + * keep a list on each CPU, with each list protected by its own spinlock. + * Global serialization is done using file_rwsem. + * + * Note that alterations to the list also require that the relevant flc_lock is + * held. */ -DEFINE_STATIC_LGLOCK(file_lock_lglock); -static DEFINE_PER_CPU(struct hlist_head, file_lock_list); +struct file_lock_list_struct { + spinlock_t lock; + struct hlist_head hlist; +}; +static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list); +DEFINE_STATIC_PERCPU_RWSEM(file_rwsem); /* * The blocked_hash is used to find POSIX lock loops for deadlock detection. @@ -587,15 +592,23 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) /* Must be called with the flc_lock held! */ static void locks_insert_global_locks(struct file_lock *fl) { - lg_local_lock(&file_lock_lglock); + struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list); + + percpu_rwsem_assert_held(&file_rwsem); + + spin_lock(&fll->lock); fl->fl_link_cpu = smp_processor_id(); - hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list)); - lg_local_unlock(&file_lock_lglock); + hlist_add_head(&fl->fl_link, &fll->hlist); + spin_unlock(&fll->lock); } /* Must be called with the flc_lock held! */ static void locks_delete_global_locks(struct file_lock *fl) { + struct file_lock_list_struct *fll; + + percpu_rwsem_assert_held(&file_rwsem); + /* * Avoid taking lock if already unhashed. This is safe since this check * is done while holding the flc_lock, and new insertions into the list @@ -603,9 +616,11 @@ static void locks_delete_global_locks(struct file_lock *fl) */ if (hlist_unhashed(&fl->fl_link)) return; - lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu); + + fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu); + spin_lock(&fll->lock); hlist_del_init(&fl->fl_link); - lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu); + spin_unlock(&fll->lock); } static unsigned long @@ -915,6 +930,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) return -ENOMEM; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); if (request->fl_flags & FL_ACCESS) goto find_conflict; @@ -955,6 +971,7 @@ find_conflict: out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); if (new_fl) locks_free_lock(new_fl); locks_dispose_list(&dispose); @@ -991,6 +1008,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, new_fl2 = locks_alloc_lock(); } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); /* * New lock request. Walk all POSIX locks and look for conflicts. If @@ -1162,6 +1180,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, } out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); /* * Free any unused locks. */ @@ -1436,6 +1455,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) return error; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); @@ -1487,9 +1507,13 @@ restart: locks_insert_block(fl, new_fl); trace_break_lease_block(inode, new_fl); spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); + locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); + + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); @@ -1506,6 +1530,7 @@ restart: } out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); locks_free_lock(new_fl); return error; @@ -1660,6 +1685,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr return -EINVAL; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); error = check_conflicting_open(dentry, arg, lease->fl_flags); @@ -1730,6 +1756,7 @@ out_setup: lease->fl_lmops->lm_setup(lease, priv); out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); if (is_deleg) inode_unlock(inode); @@ -1752,6 +1779,7 @@ static int generic_delete_lease(struct file *filp, void *owner) return error; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file == filp && @@ -1764,6 +1792,7 @@ static int generic_delete_lease(struct file *filp, void *owner) if (victim) error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); return error; } @@ -2574,9 +2603,20 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, struct inode *inode = NULL; unsigned int fl_pid; - if (fl->fl_nspid) - fl_pid = pid_vnr(fl->fl_nspid); - else + if (fl->fl_nspid) { + struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; + + /* Don't let fl_pid change based on who is reading the file */ + fl_pid = pid_nr_ns(fl->fl_nspid, proc_pidns); + + /* + * If there isn't a fl_pid don't display who is waiting on + * the lock if we are called from locks_show, or if we are + * called from __show_fd_info - skip lock entirely + */ + if (fl_pid == 0) + return; + } else fl_pid = fl->fl_pid; if (fl->fl_file != NULL) @@ -2648,9 +2688,13 @@ static int locks_show(struct seq_file *f, void *v) { struct locks_iterator *iter = f->private; struct file_lock *fl, *bfl; + struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; fl = hlist_entry(v, struct file_lock, fl_link); + if (fl->fl_nspid && !pid_nr_ns(fl->fl_nspid, proc_pidns)) + return 0; + lock_get_status(f, fl, iter->li_pos, ""); list_for_each_entry(bfl, &fl->fl_block, fl_block) @@ -2703,9 +2747,9 @@ static void *locks_start(struct seq_file *f, loff_t *pos) struct locks_iterator *iter = f->private; iter->li_pos = *pos + 1; - lg_global_lock(&file_lock_lglock); + percpu_down_write(&file_rwsem); spin_lock(&blocked_lock_lock); - return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos); + return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos); } static void *locks_next(struct seq_file *f, void *v, loff_t *pos) @@ -2713,14 +2757,14 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos) struct locks_iterator *iter = f->private; ++iter->li_pos; - return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos); + return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos); } static void locks_stop(struct seq_file *f, void *v) __releases(&blocked_lock_lock) { spin_unlock(&blocked_lock_lock); - lg_global_unlock(&file_lock_lglock); + percpu_up_write(&file_rwsem); } static const struct seq_operations locks_seq_operations = { @@ -2761,10 +2805,13 @@ static int __init filelock_init(void) filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); - lg_lock_init(&file_lock_lglock, "file_lock_lglock"); - for_each_possible_cpu(i) - INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i)); + for_each_possible_cpu(i) { + struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i); + + spin_lock_init(&fll->lock); + INIT_HLIST_HEAD(&fll->hlist); + } return 0; } diff --git a/fs/mount.h b/fs/mount.h index 14db05d424f7..d2e25d7b64b3 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -10,9 +10,12 @@ struct mnt_namespace { struct mount * root; struct list_head list; struct user_namespace *user_ns; + struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; u64 event; + unsigned int mounts; /* # of mounts in the namespace */ + unsigned int pending_mounts; }; struct mnt_pcp { diff --git a/fs/namespace.c b/fs/namespace.c index 7bb2cda3bfef..db1b5a38864e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,9 @@ #include "pnode.h" #include "internal.h" +/* Maximum number of mounts in a mount namespace */ +unsigned int sysctl_mount_max __read_mostly = 100000; + static unsigned int m_hash_mask __read_mostly; static unsigned int m_hash_shift __read_mostly; static unsigned int mp_hash_mask __read_mostly; @@ -899,6 +902,9 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) list_splice(&head, n->list.prev); + n->mounts += n->pending_mounts; + n->pending_mounts = 0; + attach_shadowed(mnt, parent, shadows); touch_mnt_namespace(n); } @@ -1419,11 +1425,16 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) propagate_umount(&tmp_list); while (!list_empty(&tmp_list)) { + struct mnt_namespace *ns; bool disconnect; p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); - __touch_mnt_namespace(p->mnt_ns); + ns = p->mnt_ns; + if (ns) { + ns->mounts--; + __touch_mnt_namespace(ns); + } p->mnt_ns = NULL; if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; @@ -1840,6 +1851,28 @@ static int invent_group_ids(struct mount *mnt, bool recurse) return 0; } +int count_mounts(struct mnt_namespace *ns, struct mount *mnt) +{ + unsigned int max = READ_ONCE(sysctl_mount_max); + unsigned int mounts = 0, old, pending, sum; + struct mount *p; + + for (p = mnt; p; p = next_mnt(p, mnt)) + mounts++; + + old = ns->mounts; + pending = ns->pending_mounts; + sum = old + pending; + if ((old > sum) || + (pending > sum) || + (max < sum) || + (mounts > (max - sum))) + return -ENOSPC; + + ns->pending_mounts = pending + mounts; + return 0; +} + /* * @source_mnt : mount tree to be attached * @nd : place the mount tree @source_mnt is attached @@ -1909,10 +1942,18 @@ static int attach_recursive_mnt(struct mount *source_mnt, struct path *parent_path) { HLIST_HEAD(tree_list); + struct mnt_namespace *ns = dest_mnt->mnt_ns; struct mount *child, *p; struct hlist_node *n; int err; + /* Is there space to add these mounts to the mount namespace? */ + if (!parent_path) { + err = count_mounts(ns, source_mnt); + if (err) + goto out; + } + if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) @@ -1949,11 +1990,13 @@ static int attach_recursive_mnt(struct mount *source_mnt, out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); + child->mnt_parent->mnt_ns->pending_mounts = 0; umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: + ns->pending_mounts = 0; return err; } @@ -2719,9 +2762,20 @@ dput_out: return retval; } +static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES); +} + +static void dec_mnt_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES); +} + static void free_mnt_ns(struct mnt_namespace *ns) { ns_free_inum(&ns->ns); + dec_mnt_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kfree(ns); } @@ -2738,14 +2792,22 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) { struct mnt_namespace *new_ns; + struct ucounts *ucounts; int ret; + ucounts = inc_mnt_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); - if (!new_ns) + if (!new_ns) { + dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); + } ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); + dec_mnt_namespaces(ucounts); return ERR_PTR(ret); } new_ns->ns.ops = &mntns_operations; @@ -2756,6 +2818,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) init_waitqueue_head(&new_ns->poll); new_ns->event = 0; new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; + new_ns->mounts = 0; + new_ns->pending_mounts = 0; return new_ns; } @@ -2805,6 +2870,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, q = new; while (p) { q->mnt_ns = new_ns; + new_ns->mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { new_fs->root.mnt = mntget(&q->mnt); @@ -2843,6 +2909,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; new_ns->root = mnt; + new_ns->mounts++; list_add(&mnt->mnt_list, &new_ns->list); } else { mntput(m); @@ -3348,10 +3415,16 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static struct user_namespace *mntns_owner(struct ns_common *ns) +{ + return to_mnt_ns(ns)->user_ns; +} + const struct proc_ns_operations mntns_operations = { .name = "mnt", .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, + .owner = mntns_owner, }; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7d620970f2e1..ca699ddc11c1 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -657,7 +657,10 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result <= 0) goto out; - written = generic_write_sync(iocb, result); + result = generic_write_sync(iocb, result); + if (result < 0) + goto out; + written = result; iocb->ki_pos += written; /* Return error values */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f5aecaabcb7c..a9dec32ba9ba 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7570,12 +7570,20 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); trace_nfs4_create_session(clp, status); + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EACCES: + case -EAGAIN: + goto out; + }; + + clp->cl_seqid++; if (!status) { /* Verify the session's negotiated channel_attrs values */ status = nfs4_verify_channel_attrs(&args, &res); /* Increment the clientid slot sequence id */ - if (clp->cl_seqid == res.seqid) - clp->cl_seqid++; if (status) goto out; nfs4_update_session(session, &res); @@ -8190,10 +8198,13 @@ static void nfs4_layoutreturn_release(void *calldata) dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, - be32_to_cpu(lrp->args.stateid.seqid)); - if (lrp->res.lrs_present && pnfs_layout_is_valid(lo)) + if (lrp->res.lrs_present) { + pnfs_mark_matching_lsegs_invalid(lo, &freeme, + &lrp->args.range, + be32_to_cpu(lrp->args.stateid.seqid)); pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + } else + pnfs_mark_layout_stateid_invalid(lo, &freeme); pnfs_clear_layoutreturn_waitbit(lo); spin_unlock(&lo->plh_inode->i_lock); nfs4_sequence_free_slot(&lrp->res.seq_res); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6daf034645c8..2c93a85eda51 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -365,7 +365,8 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ atomic_dec(&lo->plh_refcount); if (list_empty(&lo->plh_segs)) { - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + if (atomic_read(&lo->plh_outstanding) == 0) + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); @@ -768,17 +769,32 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) pnfs_destroy_layouts_byclid(clp, false); } +static void +pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) +{ + lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); +} + /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier) { u32 oldseq, newseq, new_barrier = 0; - bool invalid = !pnfs_layout_is_valid(lo); oldseq = be32_to_cpu(lo->plh_stateid.seqid); newseq = be32_to_cpu(new->seqid); - if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) { + + if (!pnfs_layout_is_valid(lo)) { + nfs4_stateid_copy(&lo->plh_stateid, new); + lo->plh_barrier = newseq; + pnfs_clear_layoutreturn_info(lo); + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + return; + } + if (pnfs_seqid_is_newer(newseq, oldseq)) { nfs4_stateid_copy(&lo->plh_stateid, new); /* * Because of wraparound, we want to keep the barrier @@ -790,7 +806,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, new_barrier = be32_to_cpu(new->seqid); else if (new_barrier == 0) return; - if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) + if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) lo->plh_barrier = new_barrier; } @@ -886,19 +902,14 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } -static void -pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) -{ - lo->plh_return_iomode = 0; - lo->plh_return_seq = 0; - clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); -} - static bool pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, enum pnfs_iomode *iomode) { + /* Serialise LAYOUTGET/LAYOUTRETURN */ + if (atomic_read(&lo->plh_outstanding) != 0) + return false; if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; pnfs_get_layout_hdr(lo); @@ -1798,16 +1809,11 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) */ pnfs_mark_layout_stateid_invalid(lo, &free_me); - nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); - lo->plh_barrier = be32_to_cpu(res->stateid.seqid); + pnfs_set_layout_stateid(lo, &res->stateid, true); } pnfs_get_lseg(lseg); pnfs_layout_insert_lseg(lo, lseg, &free_me); - if (!pnfs_layout_is_valid(lo)) { - pnfs_clear_layoutreturn_info(lo); - clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - } if (res->return_on_close) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index d2f97ecca6a5..e0e5f7c3c99f 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -67,18 +67,7 @@ static int fanotify_get_response(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - wait_event(group->fanotify_data.access_waitq, event->response || - atomic_read(&group->fanotify_data.bypass_perm)); - - if (!event->response) { /* bypass_perm set */ - /* - * Event was canceled because group is being destroyed. Remove - * it from group's event list because we are responsible for - * freeing the permission event. - */ - fsnotify_remove_event(group, &event->fae.fse); - return 0; - } + wait_event(group->fanotify_data.access_waitq, event->response); /* userspace responded, convert to something usable */ switch (event->response) { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8e8e6bcd1d43..a64313868d3a 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -358,16 +358,20 @@ static int fanotify_release(struct inode *ignored, struct file *file) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_perm_event_info *event, *next; + struct fsnotify_event *fsn_event; /* - * There may be still new events arriving in the notification queue - * but since userspace cannot use fanotify fd anymore, no event can - * enter or leave access_list by now. + * Stop new events from arriving in the notification queue. since + * userspace cannot use fanotify fd anymore, no event can enter or + * leave access_list by now either. */ - spin_lock(&group->fanotify_data.access_lock); - - atomic_inc(&group->fanotify_data.bypass_perm); + fsnotify_group_stop_queueing(group); + /* + * Process all permission events on access_list and notification queue + * and simulate reply from userspace. + */ + spin_lock(&group->fanotify_data.access_lock); list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, fae.fse.list) { pr_debug("%s: found group=%p event=%p\n", __func__, group, @@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file) spin_unlock(&group->fanotify_data.access_lock); /* - * Since bypass_perm is set, newly queued events will not wait for - * access response. Wake up the already sleeping ones now. - * synchronize_srcu() in fsnotify_destroy_group() will wait for all - * processes sleeping in fanotify_handle_event() waiting for access - * response and thus also for all permission events to be freed. + * Destroy all non-permission events. For permission events just + * dequeue them and set the response. They will be freed once the + * response is consumed and fanotify_get_response() returns. */ + mutex_lock(&group->notification_mutex); + while (!fsnotify_notify_queue_is_empty(group)) { + fsn_event = fsnotify_remove_first_event(group); + if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) + fsnotify_destroy_event(group, fsn_event); + else + FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; + } + mutex_unlock(&group->notification_mutex); + + /* Response for all permission events it set, wakeup waiters */ wake_up(&group->fanotify_data.access_waitq); #endif @@ -755,7 +768,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) spin_lock_init(&group->fanotify_data.access_lock); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); - atomic_set(&group->fanotify_data.bypass_perm, 0); #endif switch (flags & FAN_ALL_CLASS_BITS) { case FAN_CLASS_NOTIF: diff --git a/fs/notify/group.c b/fs/notify/group.c index 3e2dd85be5dd..b47f7cfdcaa4 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -40,6 +40,17 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) } /* + * Stop queueing new events for this group. Once this function returns + * fsnotify_add_event() will not add any new events to the group's queue. + */ +void fsnotify_group_stop_queueing(struct fsnotify_group *group) +{ + mutex_lock(&group->notification_mutex); + group->shutdown = true; + mutex_unlock(&group->notification_mutex); +} + +/* * Trying to get rid of a group. Remove all marks, flush all events and release * the group reference. * Note that another thread calling fsnotify_clear_marks_by_group() may still @@ -47,6 +58,14 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) */ void fsnotify_destroy_group(struct fsnotify_group *group) { + /* + * Stop queueing new events. The code below is careful enough to not + * require this but fanotify needs to stop queuing events even before + * fsnotify_destroy_group() is called and this makes the other callers + * of fsnotify_destroy_group() to see the same behavior. + */ + fsnotify_group_stop_queueing(group); + /* clear all inode marks for this group, attach them to destroy_list */ fsnotify_detach_group_marks(group); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index a95d8e037aeb..e455e83ceeeb 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -82,7 +82,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group, * Add an event to the group notification queue. The group can later pull this * event off the queue to deal with. The function returns 0 if the event was * added to the queue, 1 if the event was merged with some other queued event, - * 2 if the queue of events has overflown. + * 2 if the event was not queued - either the queue of events has overflown + * or the group is shutting down. */ int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, @@ -96,6 +97,11 @@ int fsnotify_add_event(struct fsnotify_group *group, mutex_lock(&group->notification_mutex); + if (group->shutdown) { + mutex_unlock(&group->notification_mutex); + return 2; + } + if (group->q_len >= group->max_events) { ret = 2; /* Queue overflow event only if it isn't already queued */ @@ -126,21 +132,6 @@ queue: } /* - * Remove @event from group's notification queue. It is the responsibility of - * the caller to destroy the event. - */ -void fsnotify_remove_event(struct fsnotify_group *group, - struct fsnotify_event *event) -{ - mutex_lock(&group->notification_mutex); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - group->q_len--; - } - mutex_unlock(&group->notification_mutex); -} - -/* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event */ diff --git a/fs/nsfs.c b/fs/nsfs.c index 8f20d6016e20..30bb10034120 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -5,11 +5,16 @@ #include <linux/magic.h> #include <linux/ktime.h> #include <linux/seq_file.h> +#include <linux/user_namespace.h> +#include <linux/nsfs.h> static struct vfsmount *nsfs_mnt; +static long ns_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg); static const struct file_operations ns_file_operations = { .llseek = no_llseek, + .unlocked_ioctl = ns_ioctl, }; static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) @@ -44,22 +49,14 @@ static void nsfs_evict(struct inode *inode) ns->ops->put(ns); } -void *ns_get_path(struct path *path, struct task_struct *task, - const struct proc_ns_operations *ns_ops) +static void *__ns_get_path(struct path *path, struct ns_common *ns) { - struct vfsmount *mnt = mntget(nsfs_mnt); + struct vfsmount *mnt = nsfs_mnt; struct qstr qname = { .name = "", }; struct dentry *dentry; struct inode *inode; - struct ns_common *ns; unsigned long d; -again: - ns = ns_ops->get(task); - if (!ns) { - mntput(mnt); - return ERR_PTR(-ENOENT); - } rcu_read_lock(); d = atomic_long_read(&ns->stashed); if (!d) @@ -68,17 +65,16 @@ again: if (!lockref_get_not_dead(&dentry->d_lockref)) goto slow; rcu_read_unlock(); - ns_ops->put(ns); + ns->ops->put(ns); got_it: - path->mnt = mnt; + path->mnt = mntget(mnt); path->dentry = dentry; return NULL; slow: rcu_read_unlock(); inode = new_inode_pseudo(mnt->mnt_sb); if (!inode) { - ns_ops->put(ns); - mntput(mnt); + ns->ops->put(ns); return ERR_PTR(-ENOMEM); } inode->i_ino = ns->inum; @@ -91,21 +87,96 @@ slow: dentry = d_alloc_pseudo(mnt->mnt_sb, &qname); if (!dentry) { iput(inode); - mntput(mnt); return ERR_PTR(-ENOMEM); } d_instantiate(dentry, inode); - dentry->d_fsdata = (void *)ns_ops; + dentry->d_fsdata = (void *)ns->ops; d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); if (d) { d_delete(dentry); /* make sure ->d_prune() does nothing */ dput(dentry); cpu_relax(); - goto again; + return ERR_PTR(-EAGAIN); } goto got_it; } +void *ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct ns_common *ns; + void *ret; + +again: + ns = ns_ops->get(task); + if (!ns) + return ERR_PTR(-ENOENT); + + ret = __ns_get_path(path, ns); + if (IS_ERR(ret) && PTR_ERR(ret) == -EAGAIN) + goto again; + return ret; +} + +static int open_related_ns(struct ns_common *ns, + struct ns_common *(*get_ns)(struct ns_common *ns)) +{ + struct path path = {}; + struct file *f; + void *err; + int fd; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return fd; + + while (1) { + struct ns_common *relative; + + relative = get_ns(ns); + if (IS_ERR(relative)) { + put_unused_fd(fd); + return PTR_ERR(relative); + } + + err = __ns_get_path(&path, relative); + if (IS_ERR(err) && PTR_ERR(err) == -EAGAIN) + continue; + break; + } + if (IS_ERR(err)) { + put_unused_fd(fd); + return PTR_ERR(err); + } + + f = dentry_open(&path, O_RDONLY, current_cred()); + path_put(&path); + if (IS_ERR(f)) { + put_unused_fd(fd); + fd = PTR_ERR(f); + } else + fd_install(fd, f); + + return fd; +} + +static long ns_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct ns_common *ns = get_proc_ns(file_inode(filp)); + + switch (ioctl) { + case NS_GET_USERNS: + return open_related_ns(ns, ns_get_owner); + case NS_GET_PARENT: + if (!ns->ops->get_parent) + return -EINVAL; + return open_related_ns(ns, ns->ops->get_parent); + default: + return -ENOTTY; + } +} + int ns_get_name(char *buf, size_t size, struct task_struct *task, const struct proc_ns_operations *ns_ops) { diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 7dabbc31060e..f165f867f332 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5922,7 +5922,6 @@ bail: } static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, - handle_t *handle, struct inode *data_alloc_inode, struct buffer_head *data_alloc_bh) { @@ -5935,11 +5934,19 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, struct ocfs2_truncate_log *tl; struct inode *tl_inode = osb->osb_tl_inode; struct buffer_head *tl_bh = osb->osb_tl_bh; + handle_t *handle; di = (struct ocfs2_dinode *) tl_bh->b_data; tl = &di->id2.i_dealloc; i = le16_to_cpu(tl->tl_used) - 1; while (i >= 0) { + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail; + } + /* Caller has given us at least enough credits to * update the truncate log dinode */ status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, @@ -5974,12 +5981,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, } } - status = ocfs2_extend_trans(handle, - OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_commit_trans(osb, handle); i--; } @@ -5994,7 +5996,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) { int status; unsigned int num_to_flush; - handle_t *handle; struct inode *tl_inode = osb->osb_tl_inode; struct inode *data_alloc_inode = NULL; struct buffer_head *tl_bh = osb->osb_tl_bh; @@ -6038,21 +6039,11 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out_unlock; - } - - status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, + status = ocfs2_replay_truncate_records(osb, data_alloc_inode, data_alloc_bh); if (status < 0) mlog_errno(status); - ocfs2_commit_trans(osb, handle); - -out_unlock: brelse(data_alloc_bh); ocfs2_inode_unlock(data_alloc_inode, 1); @@ -6413,43 +6404,34 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out_unlock; - } - while (head) { if (head->free_bg) bg_blkno = head->free_bg; else bg_blkno = ocfs2_which_suballoc_group(head->free_blk, head->free_bit); + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + trace_ocfs2_free_cached_blocks( (unsigned long long)head->free_blk, head->free_bit); ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, head->free_bit, bg_blkno, 1); - if (ret) { + if (ret) mlog_errno(ret); - goto out_journal; - } - ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); - if (ret) { - mlog_errno(ret); - goto out_journal; - } + ocfs2_commit_trans(osb, handle); tmp = head; head = head->free_next; kfree(tmp); } -out_journal: - ocfs2_commit_trans(osb, handle); - out_unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 98d36548153d..bbb4b3e5b4ff 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1842,6 +1842,16 @@ out_commit: ocfs2_commit_trans(osb, handle); out: + /* + * The mmapped page won't be unlocked in ocfs2_free_write_ctxt(), + * even in case of error here like ENOSPC and ENOMEM. So, we need + * to unlock the target page manually to prevent deadlocks when + * retrying again on ENOSPC, or when returning non-VM_FAULT_LOCKED + * to VM code. + */ + if (wc->w_target_locked) + unlock_page(mmap_page); + ocfs2_free_write_ctxt(inode, wc); if (data_ac) { diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 94b18369b1cc..b95e7df5b76a 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -44,9 +44,6 @@ * version here in tcp_internal.h should not need to be bumped for * filesystem locking changes. * - * New in version 12 - * - Negotiate hb timeout when storage is down. - * * New in version 11 * - Negotiation of filesystem locking in the dlm join. * @@ -78,7 +75,7 @@ * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 12ULL +#define O2NET_PROTOCOL_VERSION 11ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index cdeafb4e7ed6..0bb128659d4b 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -268,7 +268,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, struct dlm_lock *lock, int flags, int type) { enum dlm_status status; - u8 old_owner = res->owner; mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); @@ -335,7 +334,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; - lock->convert_pending = 0; /* if it failed, move it back to granted queue. * if master returns DLM_NORMAL and then down before sending ast, * it may have already been moved to granted queue, reset to @@ -344,12 +342,14 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, if (status != DLM_NOTQUEUED) dlm_error(status); dlm_revert_pending_convert(res, lock); - } else if ((res->state & DLM_LOCK_RES_RECOVERING) || - (old_owner != res->owner)) { - mlog(0, "res %.*s is in recovering or has been recovered.\n", - res->lockname.len, res->lockname.name); + } else if (!lock->convert_pending) { + mlog(0, "%s: res %.*s, owner died and lock has been moved back " + "to granted list, retry convert.\n", + dlm->name, res->lockname.len, res->lockname.name); status = DLM_RECOVERING; } + + lock->convert_pending = 0; bail: spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4e7b0dc22450..0b055bfb8e86 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1506,7 +1506,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, u64 start, u64 len) { int ret = 0; - u64 tmpend, end = start + len; + u64 tmpend = 0; + u64 end = start + len; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); unsigned int csize = osb->s_clustersize; handle_t *handle; @@ -1538,18 +1539,31 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, } /* - * We want to get the byte offset of the end of the 1st cluster. + * If start is on a cluster boundary and end is somewhere in another + * cluster, we have not COWed the cluster starting at start, unless + * end is also within the same cluster. So, in this case, we skip this + * first call to ocfs2_zero_range_for_truncate() truncate and move on + * to the next one. */ - tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); - if (tmpend > end) - tmpend = end; + if ((start & (csize - 1)) != 0) { + /* + * We want to get the byte offset of the end of the 1st + * cluster. + */ + tmpend = (u64)osb->s_clustersize + + (start & ~(osb->s_clustersize - 1)); + if (tmpend > end) + tmpend = end; - trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, - (unsigned long long)tmpend); + trace_ocfs2_zero_partial_clusters_range1( + (unsigned long long)start, + (unsigned long long)tmpend); - ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); - if (ret) - mlog_errno(ret); + ret = ocfs2_zero_range_for_truncate(inode, handle, start, + tmpend); + if (ret) + mlog_errno(ret); + } if (tmpend < end) { /* diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index ea47120a85ff..6ad3533940ba 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1199,14 +1199,24 @@ retry: inode_unlock((*ac)->ac_inode); ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted); - if (ret == 1) + if (ret == 1) { + iput((*ac)->ac_inode); + (*ac)->ac_inode = NULL; goto retry; + } if (ret < 0) mlog_errno(ret); inode_lock((*ac)->ac_inode); - ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); + ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + inode_unlock((*ac)->ac_inode); + iput((*ac)->ac_inode); + (*ac)->ac_inode = NULL; + goto bail; + } } if (status < 0) { if (status != -ENOSPC) diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c index 00235bf644dc..1e8fe844e69f 100644 --- a/fs/orangefs/dcache.c +++ b/fs/orangefs/dcache.c @@ -73,7 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry) } } - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; ret = 1; out_release_op: op_release(new_op); diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index a287a66d94e3..516ffb4dc9a0 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -11,14 +11,19 @@ #include "orangefs-kernel.h" #include "orangefs-dev-proto.h" #include "orangefs-bufmap.h" +#include "orangefs-debugfs.h" #include <linux/debugfs.h> #include <linux/slab.h> /* this file implements the /dev/pvfs2-req device node */ +uint32_t orangefs_userspace_version; + static int open_access_count; +static DEFINE_MUTEX(devreq_mutex); + #define DUMP_DEVICE_ERROR() \ do { \ gossip_err("*****************************************************\n");\ @@ -43,7 +48,7 @@ static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op) { int index = hash_func(op->tag, hash_table_size); - list_add_tail(&op->list, &htable_ops_in_progress[index]); + list_add_tail(&op->list, &orangefs_htable_ops_in_progress[index]); } /* @@ -57,20 +62,20 @@ static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag) index = hash_func(tag, hash_table_size); - spin_lock(&htable_ops_in_progress_lock); + spin_lock(&orangefs_htable_ops_in_progress_lock); list_for_each_entry_safe(op, next, - &htable_ops_in_progress[index], + &orangefs_htable_ops_in_progress[index], list) { if (op->tag == tag && !op_state_purged(op) && !op_state_given_up(op)) { list_del_init(&op->list); - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); return op; } } - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); return NULL; } @@ -276,11 +281,11 @@ restart: if (ret != 0) goto error; - spin_lock(&htable_ops_in_progress_lock); + spin_lock(&orangefs_htable_ops_in_progress_lock); spin_lock(&cur_op->lock); if (unlikely(op_state_given_up(cur_op))) { spin_unlock(&cur_op->lock); - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); complete(&cur_op->waitq); goto restart; } @@ -298,7 +303,7 @@ restart: current->comm); orangefs_devreq_add_op(cur_op); spin_unlock(&cur_op->lock); - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); /* The client only asks to read one size buffer. */ return MAX_DEV_REQ_UPSIZE; @@ -387,6 +392,13 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, return -EPROTO; } + if (!orangefs_userspace_version) { + orangefs_userspace_version = head.version; + } else if (orangefs_userspace_version != head.version) { + gossip_err("Error: userspace version changes\n"); + return -EPROTO; + } + /* remove the op from the in progress hash table */ op = orangefs_devreq_remove_op(head.tag); if (!op) { @@ -527,6 +539,7 @@ static int orangefs_devreq_release(struct inode *inode, struct file *file) gossip_debug(GOSSIP_DEV_DEBUG, "pvfs2-client-core: device close complete\n"); open_access_count = 0; + orangefs_userspace_version = 0; mutex_unlock(&devreq_mutex); return 0; } @@ -576,8 +589,6 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg) static __s32 max_down_size = MAX_DEV_REQ_DOWNSIZE; struct ORANGEFS_dev_map_desc user_desc; int ret = 0; - struct dev_mask_info_s mask_info = { 0 }; - struct dev_mask2_info_s mask2_info = { 0, 0 }; int upstream_kmod = 1; struct orangefs_sb_info_s *orangefs_sb; @@ -619,7 +630,7 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg) * all of the remounts are serviced (to avoid ops between * mounts to fail) */ - ret = mutex_lock_interruptible(&request_mutex); + ret = mutex_lock_interruptible(&orangefs_request_mutex); if (ret < 0) return ret; gossip_debug(GOSSIP_DEV_DEBUG, @@ -654,7 +665,7 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg) gossip_debug(GOSSIP_DEV_DEBUG, "%s: priority remount complete\n", __func__); - mutex_unlock(&request_mutex); + mutex_unlock(&orangefs_request_mutex); return ret; case ORANGEFS_DEV_UPSTREAM: @@ -668,134 +679,11 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg) return ret; case ORANGEFS_DEV_CLIENT_MASK: - ret = copy_from_user(&mask2_info, - (void __user *)arg, - sizeof(struct dev_mask2_info_s)); - - if (ret != 0) - return -EIO; - - client_debug_mask.mask1 = mask2_info.mask1_value; - client_debug_mask.mask2 = mask2_info.mask2_value; - - pr_info("%s: client debug mask has been been received " - ":%llx: :%llx:\n", - __func__, - (unsigned long long)client_debug_mask.mask1, - (unsigned long long)client_debug_mask.mask2); - - return ret; - + return orangefs_debugfs_new_client_mask((void __user *)arg); case ORANGEFS_DEV_CLIENT_STRING: - ret = copy_from_user(&client_debug_array_string, - (void __user *)arg, - ORANGEFS_MAX_DEBUG_STRING_LEN); - /* - * The real client-core makes an effort to ensure - * that actual strings that aren't too long to fit in - * this buffer is what we get here. We're going to use - * string functions on the stuff we got, so we'll make - * this extra effort to try and keep from - * flowing out of this buffer when we use the string - * functions, even if somehow the stuff we end up - * with here is garbage. - */ - client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] = - '\0'; - - if (ret != 0) { - pr_info("%s: CLIENT_STRING: copy_from_user failed\n", - __func__); - return -EIO; - } - - pr_info("%s: client debug array string has been received.\n", - __func__); - - if (!help_string_initialized) { - - /* Free the "we don't know yet" default string... */ - kfree(debug_help_string); - - /* build a proper debug help string */ - if (orangefs_prepare_debugfs_help_string(0)) { - gossip_err("%s: no debug help string \n", - __func__); - return -EIO; - } - - /* Replace the boilerplate boot-time debug-help file. */ - debugfs_remove(help_file_dentry); - - help_file_dentry = - debugfs_create_file( - ORANGEFS_KMOD_DEBUG_HELP_FILE, - 0444, - debug_dir, - debug_help_string, - &debug_help_fops); - - if (!help_file_dentry) { - gossip_err("%s: debugfs_create_file failed for" - " :%s:!\n", - __func__, - ORANGEFS_KMOD_DEBUG_HELP_FILE); - return -EIO; - } - } - - debug_mask_to_string(&client_debug_mask, 1); - - debugfs_remove(client_debug_dentry); - - orangefs_client_debug_init(); - - help_string_initialized++; - - return ret; - + return orangefs_debugfs_new_client_string((void __user *)arg); case ORANGEFS_DEV_DEBUG: - ret = copy_from_user(&mask_info, - (void __user *)arg, - sizeof(mask_info)); - - if (ret != 0) - return -EIO; - - if (mask_info.mask_type == KERNEL_MASK) { - if ((mask_info.mask_value == 0) - && (kernel_mask_set_mod_init)) { - /* - * the kernel debug mask was set when the - * kernel module was loaded; don't override - * it if the client-core was started without - * a value for ORANGEFS_KMODMASK. - */ - return 0; - } - debug_mask_to_string(&mask_info.mask_value, - mask_info.mask_type); - gossip_debug_mask = mask_info.mask_value; - pr_info("%s: kernel debug mask has been modified to " - ":%s: :%llx:\n", - __func__, - kernel_debug_string, - (unsigned long long)gossip_debug_mask); - } else if (mask_info.mask_type == CLIENT_MASK) { - debug_mask_to_string(&mask_info.mask_value, - mask_info.mask_type); - pr_info("%s: client debug mask has been modified to" - ":%s: :%llx:\n", - __func__, - client_debug_string, - llu(mask_info.mask_value)); - } else { - gossip_lerr("Invalid mask type....\n"); - return -EINVAL; - } - - return ret; - + return orangefs_debugfs_new_debug((void __user *)arg); default: return -ENOIOCTLCMD; } diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h index 66b99210f1f9..3b8923f8bf21 100644 --- a/fs/orangefs/downcall.h +++ b/fs/orangefs/downcall.h @@ -83,7 +83,10 @@ struct orangefs_listxattr_response { }; struct orangefs_param_response { - __s64 value; + union { + __s64 value64; + __s32 value32[2]; + } u; }; #define PERF_COUNT_BUF_SIZE 4096 @@ -98,6 +101,11 @@ struct orangefs_fs_key_response { char fs_key[FS_KEY_BUF_SIZE]; }; +/* 2.9.6 */ +struct orangefs_features_response { + __u64 features; +}; + struct orangefs_downcall_s { __s32 type; __s32 status; @@ -119,6 +127,7 @@ struct orangefs_downcall_s { struct orangefs_param_response param; struct orangefs_perf_count_response perf_count; struct orangefs_fs_key_response fs_key; + struct orangefs_features_response features; } resp; }; diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 526040e09f78..3386886596d6 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -14,6 +14,32 @@ #include <linux/fs.h> #include <linux/pagemap.h> +static int flush_racache(struct inode *inode) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_kernel_op_s *new_op; + int ret; + + gossip_debug(GOSSIP_UTILS_DEBUG, + "%s: %pU: Handle is %pU | fs_id %d\n", __func__, + get_khandle_from_ino(inode), &orangefs_inode->refn.khandle, + orangefs_inode->refn.fs_id); + + new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH); + if (!new_op) + return -ENOMEM; + new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn; + + ret = service_operation(new_op, "orangefs_flush_racache", + get_interruptible_flag(inode)); + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n", + __func__, ret); + + op_release(new_op); + return ret; +} + /* * Copy to client-core's address space from the buffers specified * by the iovec upto total_size bytes. @@ -386,7 +412,7 @@ ssize_t orangefs_inode_read(struct inode *inode, size_t bufmap_size; ssize_t ret = -EINVAL; - g_orangefs_stats.reads++; + orangefs_stats.reads++; bufmap_size = orangefs_bufmap_size_query(); if (count > bufmap_size) { @@ -427,7 +453,7 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n"); - g_orangefs_stats.reads++; + orangefs_stats.reads++; rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter); iocb->ki_pos = pos; @@ -488,7 +514,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite } iocb->ki_pos = pos; - g_orangefs_stats.writes++; + orangefs_stats.writes++; out: @@ -591,15 +617,24 @@ static int orangefs_file_release(struct inode *inode, struct file *file) orangefs_flush_inode(inode); /* - * remove all associated inode pages from the page cache and mmap + * remove all associated inode pages from the page cache and * readahead cache (if any); this forces an expensive refresh of * data for the next caller of mmap (or 'get_block' accesses) */ if (file->f_path.dentry->d_inode && file->f_path.dentry->d_inode->i_mapping && - mapping_nrpages(&file->f_path.dentry->d_inode->i_data)) + mapping_nrpages(&file->f_path.dentry->d_inode->i_data)) { + if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) { + gossip_debug(GOSSIP_INODE_DEBUG, + "calling flush_racache on %pU\n", + get_khandle_from_ino(inode)); + flush_racache(inode); + gossip_debug(GOSSIP_INODE_DEBUG, + "flush_racache finished\n"); + } truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping, 0); + } return 0; } diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 62c525936ee8..35269e31de92 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -72,7 +72,7 @@ static int orangefs_create(struct inode *dir, d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, @@ -183,7 +183,7 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn); if (IS_ERR(inode)) { @@ -322,7 +322,7 @@ static int orangefs_symlink(struct inode *dir, d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, @@ -386,7 +386,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c index b6edbe9fb309..aa3830b741c7 100644 --- a/fs/orangefs/orangefs-cache.c +++ b/fs/orangefs/orangefs-cache.c @@ -73,8 +73,8 @@ char *get_opname_string(struct orangefs_kernel_op_s *new_op) return "OP_STATFS"; else if (type == ORANGEFS_VFS_OP_TRUNCATE) return "OP_TRUNCATE"; - else if (type == ORANGEFS_VFS_OP_MMAP_RA_FLUSH) - return "OP_MMAP_RA_FLUSH"; + else if (type == ORANGEFS_VFS_OP_RA_FLUSH) + return "OP_RA_FLUSH"; else if (type == ORANGEFS_VFS_OP_FS_MOUNT) return "OP_FS_MOUNT"; else if (type == ORANGEFS_VFS_OP_FS_UMOUNT) @@ -97,6 +97,8 @@ char *get_opname_string(struct orangefs_kernel_op_s *new_op) return "OP_FSYNC"; else if (type == ORANGEFS_VFS_OP_FSKEY) return "OP_FSKEY"; + else if (type == ORANGEFS_VFS_OP_FEATURES) + return "OP_FEATURES"; } return "OP_UNKNOWN?"; } diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 1714a737d556..9b24107c82a8 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -43,36 +43,35 @@ #include "protocol.h" #include "orangefs-kernel.h" -static int orangefs_debug_disabled = 1; - -static int orangefs_debug_help_open(struct inode *, struct file *); +#define DEBUG_HELP_STRING_SIZE 4096 +#define HELP_STRING_UNINITIALIZED \ + "Client Debug Keywords are unknown until the first time\n" \ + "the client is started after boot.\n" +#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help" +#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug" +#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug" +#define ORANGEFS_VERBOSE "verbose" +#define ORANGEFS_ALL "all" -const struct file_operations debug_help_fops = { - .open = orangefs_debug_help_open, - .read = seq_read, - .release = seq_release, - .llseek = seq_lseek, +/* + * An array of client_debug_mask will be built to hold debug keyword/mask + * values fetched from userspace. + */ +struct client_debug_mask { + char *keyword; + __u64 mask1; + __u64 mask2; }; +static int orangefs_kernel_debug_init(void); + +static int orangefs_debug_help_open(struct inode *, struct file *); static void *help_start(struct seq_file *, loff_t *); static void *help_next(struct seq_file *, void *, loff_t *); static void help_stop(struct seq_file *, void *); static int help_show(struct seq_file *, void *); -static const struct seq_operations help_debug_ops = { - .start = help_start, - .next = help_next, - .stop = help_stop, - .show = help_show, -}; - -/* - * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and - * ORANGEFS_KMOD_DEBUG_FILE. - */ -static DEFINE_MUTEX(orangefs_debug_lock); - -int orangefs_debug_open(struct inode *, struct file *); +static int orangefs_debug_open(struct inode *, struct file *); static ssize_t orangefs_debug_read(struct file *, char __user *, @@ -84,6 +83,43 @@ static ssize_t orangefs_debug_write(struct file *, size_t, loff_t *); +static int orangefs_prepare_cdm_array(char *); +static void debug_mask_to_string(void *, int); +static void do_k_string(void *, int); +static void do_c_string(void *, int); +static int keyword_is_amalgam(char *); +static int check_amalgam_keyword(void *, int); +static void debug_string_to_mask(char *, void *, int); +static void do_c_mask(int, char *, struct client_debug_mask **); +static void do_k_mask(int, char *, __u64 **); + +static char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none"; +static char *debug_help_string; +static char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; +static char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; + +static struct dentry *help_file_dentry; +static struct dentry *client_debug_dentry; +static struct dentry *debug_dir; + +static unsigned int kernel_mask_set_mod_init; +static int orangefs_debug_disabled = 1; +static int help_string_initialized; + +static const struct seq_operations help_debug_ops = { + .start = help_start, + .next = help_next, + .stop = help_stop, + .show = help_show, +}; + +const struct file_operations debug_help_fops = { + .open = orangefs_debug_help_open, + .read = seq_read, + .release = seq_release, + .llseek = seq_lseek, +}; + static const struct file_operations kernel_debug_fops = { .open = orangefs_debug_open, .read = orangefs_debug_read, @@ -91,15 +127,55 @@ static const struct file_operations kernel_debug_fops = { .llseek = generic_file_llseek, }; +static int client_all_index; +static int client_verbose_index; + +static struct client_debug_mask *cdm_array; +static int cdm_element_count; + +static struct client_debug_mask client_debug_mask; + +/* + * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and + * ORANGEFS_KMOD_DEBUG_FILE. + */ +static DEFINE_MUTEX(orangefs_debug_lock); + /* * initialize kmod debug operations, create orangefs debugfs dir and * ORANGEFS_KMOD_DEBUG_HELP_FILE. */ -int orangefs_debugfs_init(void) +int orangefs_debugfs_init(int debug_mask) { - int rc = -ENOMEM; + /* convert input debug mask to a 64-bit unsigned integer */ + orangefs_gossip_debug_mask = (unsigned long long)debug_mask; + + /* + * set the kernel's gossip debug string; invalid mask values will + * be ignored. + */ + debug_mask_to_string(&orangefs_gossip_debug_mask, 0); + + /* remove any invalid values from the mask */ + debug_string_to_mask(kernel_debug_string, &orangefs_gossip_debug_mask, + 0); + + /* + * if the mask has a non-zero value, then indicate that the mask + * was set when the kernel module was loaded. The orangefs dev ioctl + * command will look at this boolean to determine if the kernel's + * debug mask should be overwritten when the client-core is started. + */ + if (orangefs_gossip_debug_mask != 0) + kernel_mask_set_mod_init = true; + + pr_info("%s: called with debug mask: :%s: :%llx:\n", + __func__, + kernel_debug_string, + (unsigned long long)orangefs_gossip_debug_mask); + debug_dir = debugfs_create_dir("orangefs", NULL); if (!debug_dir) { pr_info("%s: debugfs_create_dir failed.\n", __func__); @@ -117,13 +193,58 @@ int orangefs_debugfs_init(void) } orangefs_debug_disabled = 0; + + rc = orangefs_kernel_debug_init(); + +out: + + return rc; +} + +/* + * initialize the kernel-debug file. + */ +static int orangefs_kernel_debug_init(void) +{ + int rc = -ENOMEM; + struct dentry *ret; + char *k_buffer = NULL; + + gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__); + + k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); + if (!k_buffer) + goto out; + + if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) { + strcpy(k_buffer, kernel_debug_string); + strcat(k_buffer, "\n"); + } else { + strcpy(k_buffer, "none\n"); + pr_info("%s: overflow 1!\n", __func__); + } + + ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, + 0444, + debug_dir, + k_buffer, + &kernel_debug_fops); + if (!ret) { + pr_info("%s: failed to create %s.\n", + __func__, + ORANGEFS_KMOD_DEBUG_FILE); + goto out; + } + rc = 0; out: + gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc); return rc; } + void orangefs_debugfs_cleanup(void) { debugfs_remove_recursive(debug_dir); @@ -196,49 +317,6 @@ static int help_show(struct seq_file *m, void *v) } /* - * initialize the kernel-debug file. - */ -int orangefs_kernel_debug_init(void) -{ - int rc = -ENOMEM; - struct dentry *ret; - char *k_buffer = NULL; - - gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__); - - k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); - if (!k_buffer) - goto out; - - if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) { - strcpy(k_buffer, kernel_debug_string); - strcat(k_buffer, "\n"); - } else { - strcpy(k_buffer, "none\n"); - pr_info("%s: overflow 1!\n", __func__); - } - - ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, - 0444, - debug_dir, - k_buffer, - &kernel_debug_fops); - if (!ret) { - pr_info("%s: failed to create %s.\n", - __func__, - ORANGEFS_KMOD_DEBUG_FILE); - goto out; - } - - rc = 0; - -out: - - gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc); - return rc; -} - -/* * initialize the client-debug file. */ int orangefs_client_debug_init(void) @@ -282,7 +360,7 @@ out: } /* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/ -int orangefs_debug_open(struct inode *inode, struct file *file) +static int orangefs_debug_open(struct inode *inode, struct file *file) { int rc = -ENODEV; @@ -384,8 +462,8 @@ static ssize_t orangefs_debug_write(struct file *file, */ if (!strcmp(file->f_path.dentry->d_name.name, ORANGEFS_KMOD_DEBUG_FILE)) { - debug_string_to_mask(buf, &gossip_debug_mask, 0); - debug_mask_to_string(&gossip_debug_mask, 0); + debug_string_to_mask(buf, &orangefs_gossip_debug_mask, 0); + debug_mask_to_string(&orangefs_gossip_debug_mask, 0); debug_string = kernel_debug_string; gossip_debug(GOSSIP_DEBUGFS_DEBUG, "New kernel debug string is %s\n", @@ -452,3 +530,546 @@ out: kfree(buf); return rc; } + +/* + * After obtaining a string representation of the client's debug + * keywords and their associated masks, this function is called to build an + * array of these values. + */ +static int orangefs_prepare_cdm_array(char *debug_array_string) +{ + int i; + int rc = -EINVAL; + char *cds_head = NULL; + char *cds_delimiter = NULL; + int keyword_len = 0; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + /* + * figure out how many elements the cdm_array needs. + */ + for (i = 0; i < strlen(debug_array_string); i++) + if (debug_array_string[i] == '\n') + cdm_element_count++; + + if (!cdm_element_count) { + pr_info("No elements in client debug array string!\n"); + goto out; + } + + cdm_array = + kzalloc(cdm_element_count * sizeof(struct client_debug_mask), + GFP_KERNEL); + if (!cdm_array) { + pr_info("malloc failed for cdm_array!\n"); + rc = -ENOMEM; + goto out; + } + + cds_head = debug_array_string; + + for (i = 0; i < cdm_element_count; i++) { + cds_delimiter = strchr(cds_head, '\n'); + *cds_delimiter = '\0'; + + keyword_len = strcspn(cds_head, " "); + + cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL); + if (!cdm_array[i].keyword) { + rc = -ENOMEM; + goto out; + } + + sscanf(cds_head, + "%s %llx %llx", + cdm_array[i].keyword, + (unsigned long long *)&(cdm_array[i].mask1), + (unsigned long long *)&(cdm_array[i].mask2)); + + if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE)) + client_verbose_index = i; + + if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL)) + client_all_index = i; + + cds_head = cds_delimiter + 1; + } + + rc = cdm_element_count; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc); + +out: + + return rc; + +} + +/* + * /sys/kernel/debug/orangefs/debug-help can be catted to + * see all the available kernel and client debug keywords. + * + * When the kernel boots, we have no idea what keywords the + * client supports, nor their associated masks. + * + * We pass through this function once at boot and stamp a + * boilerplate "we don't know" message for the client in the + * debug-help file. We pass through here again when the client + * starts and then we can fill out the debug-help file fully. + * + * The client might be restarted any number of times between + * reboots, we only build the debug-help file the first time. + */ +int orangefs_prepare_debugfs_help_string(int at_boot) +{ + int rc = -EINVAL; + int i; + int byte_count = 0; + char *client_title = "Client Debug Keywords:\n"; + char *kernel_title = "Kernel Debug Keywords:\n"; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + if (at_boot) { + byte_count += strlen(HELP_STRING_UNINITIALIZED); + client_title = HELP_STRING_UNINITIALIZED; + } else { + /* + * fill the client keyword/mask array and remember + * how many elements there were. + */ + cdm_element_count = + orangefs_prepare_cdm_array(client_debug_array_string); + if (cdm_element_count <= 0) + goto out; + + /* Count the bytes destined for debug_help_string. */ + byte_count += strlen(client_title); + + for (i = 0; i < cdm_element_count; i++) { + byte_count += strlen(cdm_array[i].keyword + 2); + if (byte_count >= DEBUG_HELP_STRING_SIZE) { + pr_info("%s: overflow 1!\n", __func__); + goto out; + } + } + + gossip_debug(GOSSIP_UTILS_DEBUG, + "%s: cdm_element_count:%d:\n", + __func__, + cdm_element_count); + } + + byte_count += strlen(kernel_title); + for (i = 0; i < num_kmod_keyword_mask_map; i++) { + byte_count += + strlen(s_kmod_keyword_mask_map[i].keyword + 2); + if (byte_count >= DEBUG_HELP_STRING_SIZE) { + pr_info("%s: overflow 2!\n", __func__); + goto out; + } + } + + /* build debug_help_string. */ + debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL); + if (!debug_help_string) { + rc = -ENOMEM; + goto out; + } + + strcat(debug_help_string, client_title); + + if (!at_boot) { + for (i = 0; i < cdm_element_count; i++) { + strcat(debug_help_string, "\t"); + strcat(debug_help_string, cdm_array[i].keyword); + strcat(debug_help_string, "\n"); + } + } + + strcat(debug_help_string, "\n"); + strcat(debug_help_string, kernel_title); + + for (i = 0; i < num_kmod_keyword_mask_map; i++) { + strcat(debug_help_string, "\t"); + strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword); + strcat(debug_help_string, "\n"); + } + + rc = 0; + +out: + + return rc; + +} + +/* + * kernel = type 0 + * client = type 1 + */ +static void debug_mask_to_string(void *mask, int type) +{ + int i; + int len = 0; + char *debug_string; + int element_count = 0; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + if (type) { + debug_string = client_debug_string; + element_count = cdm_element_count; + } else { + debug_string = kernel_debug_string; + element_count = num_kmod_keyword_mask_map; + } + + memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); + + /* + * Some keywords, like "all" or "verbose", are amalgams of + * numerous other keywords. Make a special check for those + * before grinding through the whole mask only to find out + * later... + */ + if (check_amalgam_keyword(mask, type)) + goto out; + + /* Build the debug string. */ + for (i = 0; i < element_count; i++) + if (type) + do_c_string(mask, i); + else + do_k_string(mask, i); + + len = strlen(debug_string); + + if ((len) && (type)) + client_debug_string[len - 1] = '\0'; + else if (len) + kernel_debug_string[len - 1] = '\0'; + else if (type) + strcpy(client_debug_string, "none"); + else + strcpy(kernel_debug_string, "none"); + +out: +gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n", __func__, debug_string); + + return; + +} + +static void do_k_string(void *k_mask, int index) +{ + __u64 *mask = (__u64 *) k_mask; + + if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword)) + goto out; + + if (*mask & s_kmod_keyword_mask_map[index].mask_val) { + if ((strlen(kernel_debug_string) + + strlen(s_kmod_keyword_mask_map[index].keyword)) + < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) { + strcat(kernel_debug_string, + s_kmod_keyword_mask_map[index].keyword); + strcat(kernel_debug_string, ","); + } else { + gossip_err("%s: overflow!\n", __func__); + strcpy(kernel_debug_string, ORANGEFS_ALL); + goto out; + } + } + +out: + + return; +} + +static void do_c_string(void *c_mask, int index) +{ + struct client_debug_mask *mask = (struct client_debug_mask *) c_mask; + + if (keyword_is_amalgam(cdm_array[index].keyword)) + goto out; + + if ((mask->mask1 & cdm_array[index].mask1) || + (mask->mask2 & cdm_array[index].mask2)) { + if ((strlen(client_debug_string) + + strlen(cdm_array[index].keyword) + 1) + < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) { + strcat(client_debug_string, + cdm_array[index].keyword); + strcat(client_debug_string, ","); + } else { + gossip_err("%s: overflow!\n", __func__); + strcpy(client_debug_string, ORANGEFS_ALL); + goto out; + } + } +out: + return; +} + +static int keyword_is_amalgam(char *keyword) +{ + int rc = 0; + + if ((!strcmp(keyword, ORANGEFS_ALL)) || (!strcmp(keyword, ORANGEFS_VERBOSE))) + rc = 1; + + return rc; +} + +/* + * kernel = type 0 + * client = type 1 + * + * return 1 if we found an amalgam. + */ +static int check_amalgam_keyword(void *mask, int type) +{ + __u64 *k_mask; + struct client_debug_mask *c_mask; + int k_all_index = num_kmod_keyword_mask_map - 1; + int rc = 0; + + if (type) { + c_mask = (struct client_debug_mask *) mask; + + if ((c_mask->mask1 == cdm_array[client_all_index].mask1) && + (c_mask->mask2 == cdm_array[client_all_index].mask2)) { + strcpy(client_debug_string, ORANGEFS_ALL); + rc = 1; + goto out; + } + + if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) && + (c_mask->mask2 == cdm_array[client_verbose_index].mask2)) { + strcpy(client_debug_string, ORANGEFS_VERBOSE); + rc = 1; + goto out; + } + + } else { + k_mask = (__u64 *) mask; + + if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) { + strcpy(kernel_debug_string, ORANGEFS_ALL); + rc = 1; + goto out; + } + } + +out: + + return rc; +} + +/* + * kernel = type 0 + * client = type 1 + */ +static void debug_string_to_mask(char *debug_string, void *mask, int type) +{ + char *unchecked_keyword; + int i; + char *strsep_fodder = kstrdup(debug_string, GFP_KERNEL); + char *original_pointer; + int element_count = 0; + struct client_debug_mask *c_mask = NULL; + __u64 *k_mask = NULL; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + if (type) { + c_mask = (struct client_debug_mask *)mask; + element_count = cdm_element_count; + } else { + k_mask = (__u64 *)mask; + *k_mask = 0; + element_count = num_kmod_keyword_mask_map; + } + + original_pointer = strsep_fodder; + while ((unchecked_keyword = strsep(&strsep_fodder, ","))) + if (strlen(unchecked_keyword)) { + for (i = 0; i < element_count; i++) + if (type) + do_c_mask(i, + unchecked_keyword, + &c_mask); + else + do_k_mask(i, + unchecked_keyword, + &k_mask); + } + + kfree(original_pointer); +} + +static void do_c_mask(int i, char *unchecked_keyword, + struct client_debug_mask **sane_mask) +{ + + if (!strcmp(cdm_array[i].keyword, unchecked_keyword)) { + (**sane_mask).mask1 = (**sane_mask).mask1 | cdm_array[i].mask1; + (**sane_mask).mask2 = (**sane_mask).mask2 | cdm_array[i].mask2; + } +} + +static void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask) +{ + + if (!strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword)) + **sane_mask = (**sane_mask) | + s_kmod_keyword_mask_map[i].mask_val; +} + +int orangefs_debugfs_new_client_mask(void __user *arg) +{ + struct dev_mask2_info_s mask2_info = {0}; + int ret; + + ret = copy_from_user(&mask2_info, + (void __user *)arg, + sizeof(struct dev_mask2_info_s)); + + if (ret != 0) + return -EIO; + + client_debug_mask.mask1 = mask2_info.mask1_value; + client_debug_mask.mask2 = mask2_info.mask2_value; + + pr_info("%s: client debug mask has been been received " + ":%llx: :%llx:\n", + __func__, + (unsigned long long)client_debug_mask.mask1, + (unsigned long long)client_debug_mask.mask2); + + return ret; +} + +int orangefs_debugfs_new_client_string(void __user *arg) +{ + int ret; + + ret = copy_from_user(&client_debug_array_string, + (void __user *)arg, + ORANGEFS_MAX_DEBUG_STRING_LEN); + if (ret != 0) + return -EIO; + + /* + * The real client-core makes an effort to ensure + * that actual strings that aren't too long to fit in + * this buffer is what we get here. We're going to use + * string functions on the stuff we got, so we'll make + * this extra effort to try and keep from + * flowing out of this buffer when we use the string + * functions, even if somehow the stuff we end up + * with here is garbage. + */ + client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] = + '\0'; + + if (ret != 0) { + pr_info("%s: CLIENT_STRING: copy_from_user failed\n", + __func__); + return -EIO; + } + + pr_info("%s: client debug array string has been received.\n", + __func__); + + if (!help_string_initialized) { + + /* Free the "we don't know yet" default string... */ + kfree(debug_help_string); + + /* build a proper debug help string */ + if (orangefs_prepare_debugfs_help_string(0)) { + gossip_err("%s: no debug help string \n", + __func__); + return -EIO; + } + + /* Replace the boilerplate boot-time debug-help file. */ + debugfs_remove(help_file_dentry); + + help_file_dentry = + debugfs_create_file( + ORANGEFS_KMOD_DEBUG_HELP_FILE, + 0444, + debug_dir, + debug_help_string, + &debug_help_fops); + + if (!help_file_dentry) { + gossip_err("%s: debugfs_create_file failed for" + " :%s:!\n", + __func__, + ORANGEFS_KMOD_DEBUG_HELP_FILE); + return -EIO; + } + } + + debug_mask_to_string(&client_debug_mask, 1); + + debugfs_remove(client_debug_dentry); + + orangefs_client_debug_init(); + + help_string_initialized++; + + return ret; +} + +int orangefs_debugfs_new_debug(void __user *arg) +{ + struct dev_mask_info_s mask_info = {0}; + int ret; + + ret = copy_from_user(&mask_info, + (void __user *)arg, + sizeof(mask_info)); + + if (ret != 0) + return -EIO; + + if (mask_info.mask_type == KERNEL_MASK) { + if ((mask_info.mask_value == 0) + && (kernel_mask_set_mod_init)) { + /* + * the kernel debug mask was set when the + * kernel module was loaded; don't override + * it if the client-core was started without + * a value for ORANGEFS_KMODMASK. + */ + return 0; + } + debug_mask_to_string(&mask_info.mask_value, + mask_info.mask_type); + orangefs_gossip_debug_mask = mask_info.mask_value; + pr_info("%s: kernel debug mask has been modified to " + ":%s: :%llx:\n", + __func__, + kernel_debug_string, + (unsigned long long)orangefs_gossip_debug_mask); + } else if (mask_info.mask_type == CLIENT_MASK) { + debug_mask_to_string(&mask_info.mask_value, + mask_info.mask_type); + pr_info("%s: client debug mask has been modified to" + ":%s: :%llx:\n", + __func__, + client_debug_string, + llu(mask_info.mask_value)); + } else { + gossip_lerr("Invalid mask type....\n"); + return -EINVAL; + } + + return ret; +} diff --git a/fs/orangefs/orangefs-debugfs.h b/fs/orangefs/orangefs-debugfs.h index e4828c0e3ef9..803517269ba6 100644 --- a/fs/orangefs/orangefs-debugfs.h +++ b/fs/orangefs/orangefs-debugfs.h @@ -1,3 +1,7 @@ -int orangefs_debugfs_init(void); -int orangefs_kernel_debug_init(void); +int orangefs_debugfs_init(int); void orangefs_debugfs_cleanup(void); +int orangefs_client_debug_init(void); +int orangefs_prepare_debugfs_help_string(int); +int orangefs_debugfs_new_client_mask(void __user *); +int orangefs_debugfs_new_client_string(void __user *); +int orangefs_debugfs_new_debug(void __user *); diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h index 9eac9d9a3f3a..a3d84ffee905 100644 --- a/fs/orangefs/orangefs-dev-proto.h +++ b/fs/orangefs/orangefs-dev-proto.h @@ -28,7 +28,7 @@ #define ORANGEFS_VFS_OP_RENAME 0xFF00000A #define ORANGEFS_VFS_OP_STATFS 0xFF00000B #define ORANGEFS_VFS_OP_TRUNCATE 0xFF00000C -#define ORANGEFS_VFS_OP_MMAP_RA_FLUSH 0xFF00000D +#define ORANGEFS_VFS_OP_RA_FLUSH 0xFF00000D #define ORANGEFS_VFS_OP_FS_MOUNT 0xFF00000E #define ORANGEFS_VFS_OP_FS_UMOUNT 0xFF00000F #define ORANGEFS_VFS_OP_GETXATTR 0xFF000010 @@ -41,6 +41,10 @@ #define ORANGEFS_VFS_OP_FSYNC 0xFF00EE01 #define ORANGEFS_VFS_OP_FSKEY 0xFF00EE02 #define ORANGEFS_VFS_OP_READDIRPLUS 0xFF00EE03 +#define ORANGEFS_VFS_OP_FEATURES 0xFF00EE05 /* 2.9.6 */ + +/* features is a 64-bit unsigned bitmask */ +#define ORANGEFS_FEATURE_READAHEAD 1 /* * Misc constants. Please retain them as multiples of 8! diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 633c07a6e3d8..0a82048f3aaf 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -100,16 +100,6 @@ enum orangefs_vfs_op_states { }; /* - * An array of client_debug_mask will be built to hold debug keyword/mask - * values fetched from userspace. - */ -struct client_debug_mask { - char *keyword; - __u64 mask1; - __u64 mask2; -}; - -/* * orangefs kernel memory related flags */ @@ -119,29 +109,6 @@ struct client_debug_mask { #define ORANGEFS_CACHE_CREATE_FLAGS 0 #endif /* ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB)) */ -/* these functions are defined in orangefs-utils.c */ -int orangefs_prepare_cdm_array(char *debug_array_string); -int orangefs_prepare_debugfs_help_string(int); - -/* defined in orangefs-debugfs.c */ -int orangefs_client_debug_init(void); - -void debug_string_to_mask(char *, void *, int); -void do_c_mask(int, char *, struct client_debug_mask **); -void do_k_mask(int, char *, __u64 **); - -void debug_mask_to_string(void *, int); -void do_k_string(void *, int); -void do_c_string(void *, int); -int check_amalgam_keyword(void *, int); -int keyword_is_amalgam(char *); - -/*these variables are defined in orangefs-mod.c */ -extern char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; -extern char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; -extern char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; -extern unsigned int kernel_mask_set_mod_init; - extern int orangefs_init_acl(struct inode *inode, struct inode *dir); extern const struct xattr_handler *orangefs_xattr_handlers[]; @@ -331,7 +298,7 @@ struct orangefs_stats { unsigned long writes; }; -extern struct orangefs_stats g_orangefs_stats; +extern struct orangefs_stats orangefs_stats; /* * NOTE: See Documentation/filesystems/porting for information @@ -447,6 +414,8 @@ void purge_waiting_ops(void); /* * defined in super.c */ +extern uint64_t orangefs_features; + struct dentry *orangefs_mount(struct file_system_type *fst, int flags, const char *devname, @@ -506,6 +475,8 @@ ssize_t orangefs_inode_read(struct inode *inode, /* * defined in devorangefs-req.c */ +extern uint32_t orangefs_userspace_version; + int orangefs_dev_init(void); void orangefs_dev_cleanup(void); int is_daemon_in_service(void); @@ -543,20 +514,18 @@ bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op); int orangefs_normalize_to_errno(__s32 error_code); -extern struct mutex devreq_mutex; -extern struct mutex request_mutex; -extern int debug; +extern struct mutex orangefs_request_mutex; extern int op_timeout_secs; extern int slot_timeout_secs; -extern int dcache_timeout_msecs; -extern int getattr_timeout_msecs; +extern int orangefs_dcache_timeout_msecs; +extern int orangefs_getattr_timeout_msecs; extern struct list_head orangefs_superblocks; extern spinlock_t orangefs_superblocks_lock; extern struct list_head orangefs_request_list; extern spinlock_t orangefs_request_list_lock; extern wait_queue_head_t orangefs_request_list_waitq; -extern struct list_head *htable_ops_in_progress; -extern spinlock_t htable_ops_in_progress_lock; +extern struct list_head *orangefs_htable_ops_in_progress; +extern spinlock_t orangefs_htable_ops_in_progress_lock; extern int hash_table_size; extern const struct address_space_operations orangefs_address_operations; diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index e9fd5755c05f..2e5b03065f34 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -21,34 +21,17 @@ * global variables declared here */ -/* array of client debug keyword/mask values */ -struct client_debug_mask *cdm_array; -int cdm_element_count; - -char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none"; -char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; -char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; - -char *debug_help_string; -int help_string_initialized; -struct dentry *help_file_dentry; -struct dentry *client_debug_dentry; -struct dentry *debug_dir; -int client_verbose_index; -int client_all_index; -struct orangefs_stats g_orangefs_stats; +struct orangefs_stats orangefs_stats; /* the size of the hash tables for ops in progress */ int hash_table_size = 509; static ulong module_parm_debug_mask; -__u64 gossip_debug_mask; -struct client_debug_mask client_debug_mask = { NULL, 0, 0 }; -unsigned int kernel_mask_set_mod_init; /* implicitly false */ +__u64 orangefs_gossip_debug_mask; int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS; int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS; -int dcache_timeout_msecs = 50; -int getattr_timeout_msecs = 50; +int orangefs_dcache_timeout_msecs = 50; +int orangefs_getattr_timeout_msecs = 50; MODULE_LICENSE("GPL"); MODULE_AUTHOR("ORANGEFS Development Team"); @@ -71,20 +54,17 @@ module_param(module_parm_debug_mask, ulong, 0644); module_param(op_timeout_secs, int, 0); module_param(slot_timeout_secs, int, 0); -/* synchronizes the request device file */ -DEFINE_MUTEX(devreq_mutex); - /* * Blocks non-priority requests from being queued for servicing. This * could be used for protecting the request list data structure, but * for now it's only being used to stall the op addition to the request * list */ -DEFINE_MUTEX(request_mutex); +DEFINE_MUTEX(orangefs_request_mutex); /* hash table for storing operations waiting for matching downcall */ -struct list_head *htable_ops_in_progress; -DEFINE_SPINLOCK(htable_ops_in_progress_lock); +struct list_head *orangefs_htable_ops_in_progress; +DEFINE_SPINLOCK(orangefs_htable_ops_in_progress_lock); /* list for queueing upcall operations */ LIST_HEAD(orangefs_request_list); @@ -100,32 +80,6 @@ static int __init orangefs_init(void) int ret = -1; __u32 i = 0; - /* convert input debug mask to a 64-bit unsigned integer */ - gossip_debug_mask = (unsigned long long) module_parm_debug_mask; - - /* - * set the kernel's gossip debug string; invalid mask values will - * be ignored. - */ - debug_mask_to_string(&gossip_debug_mask, 0); - - /* remove any invalid values from the mask */ - debug_string_to_mask(kernel_debug_string, &gossip_debug_mask, 0); - - /* - * if the mask has a non-zero value, then indicate that the mask - * was set when the kernel module was loaded. The orangefs dev ioctl - * command will look at this boolean to determine if the kernel's - * debug mask should be overwritten when the client-core is started. - */ - if (gossip_debug_mask != 0) - kernel_mask_set_mod_init = true; - - pr_info("%s: called with debug mask: :%s: :%llx:\n", - __func__, - kernel_debug_string, - (unsigned long long)gossip_debug_mask); - ret = bdi_init(&orangefs_backing_dev_info); if (ret) @@ -146,9 +100,9 @@ static int __init orangefs_init(void) if (ret < 0) goto cleanup_op; - htable_ops_in_progress = + orangefs_htable_ops_in_progress = kcalloc(hash_table_size, sizeof(struct list_head), GFP_KERNEL); - if (!htable_ops_in_progress) { + if (!orangefs_htable_ops_in_progress) { gossip_err("Failed to initialize op hashtable"); ret = -ENOMEM; goto cleanup_inode; @@ -156,7 +110,7 @@ static int __init orangefs_init(void) /* initialize a doubly linked at each hash table index */ for (i = 0; i < hash_table_size; i++) - INIT_LIST_HEAD(&htable_ops_in_progress[i]); + INIT_LIST_HEAD(&orangefs_htable_ops_in_progress[i]); ret = fsid_key_table_initialize(); if (ret < 0) @@ -179,14 +133,10 @@ static int __init orangefs_init(void) if (ret) goto cleanup_key_table; - ret = orangefs_debugfs_init(); + ret = orangefs_debugfs_init(module_parm_debug_mask); if (ret) goto debugfs_init_failed; - ret = orangefs_kernel_debug_init(); - if (ret) - goto kernel_debug_init_failed; - ret = orangefs_sysfs_init(); if (ret) goto sysfs_init_failed; @@ -214,8 +164,6 @@ cleanup_device: sysfs_init_failed: -kernel_debug_init_failed: - debugfs_init_failed: orangefs_debugfs_cleanup(); @@ -223,7 +171,7 @@ cleanup_key_table: fsid_key_table_finalize(); cleanup_progress_table: - kfree(htable_ops_in_progress); + kfree(orangefs_htable_ops_in_progress); cleanup_inode: orangefs_inode_cache_finalize(); @@ -250,12 +198,12 @@ static void __exit orangefs_exit(void) orangefs_dev_cleanup(); BUG_ON(!list_empty(&orangefs_request_list)); for (i = 0; i < hash_table_size; i++) - BUG_ON(!list_empty(&htable_ops_in_progress[i])); + BUG_ON(!list_empty(&orangefs_htable_ops_in_progress[i])); orangefs_inode_cache_finalize(); op_cache_finalize(); - kfree(htable_ops_in_progress); + kfree(orangefs_htable_ops_in_progress); bdi_destroy(&orangefs_backing_dev_info); @@ -274,10 +222,10 @@ void purge_inprogress_ops(void) struct orangefs_kernel_op_s *op; struct orangefs_kernel_op_s *next; - spin_lock(&htable_ops_in_progress_lock); + spin_lock(&orangefs_htable_ops_in_progress_lock); list_for_each_entry_safe(op, next, - &htable_ops_in_progress[i], + &orangefs_htable_ops_in_progress[i], list) { set_op_state_purged(op); gossip_debug(GOSSIP_DEV_DEBUG, @@ -287,7 +235,7 @@ void purge_inprogress_ops(void) op->op_state, current->comm); } - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); } } diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index 375708c2db87..a799546a67f7 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -73,6 +73,24 @@ * Description: * Time getattr is valid in milliseconds. * + * What: /sys/fs/orangefs/readahead_count + * Date: Aug 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Readahead cache buffer count. + * + * What: /sys/fs/orangefs/readahead_size + * Date: Aug 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Readahead cache buffer size. + * + * What: /sys/fs/orangefs/readahead_count_size + * Date: Aug 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Readahead cache buffer count and size. + * * What: /sys/fs/orangefs/acache/... * Date: Jun 2015 * Contact: Martin Brandenburg <martin@omnibond.com> @@ -121,159 +139,34 @@ #define PC_KOBJ_ID "pc" #define STATS_KOBJ_ID "stats" -struct orangefs_obj { - struct kobject kobj; - int op_timeout_secs; - int perf_counter_reset; - int perf_history_size; - int perf_time_interval_secs; - int slot_timeout_secs; - int dcache_timeout_msecs; - int getattr_timeout_msecs; -}; - -struct acache_orangefs_obj { - struct kobject kobj; - int hard_limit; - int reclaim_percentage; - int soft_limit; - int timeout_msecs; -}; - -struct capcache_orangefs_obj { - struct kobject kobj; - int hard_limit; - int reclaim_percentage; - int soft_limit; - int timeout_secs; -}; - -struct ccache_orangefs_obj { - struct kobject kobj; - int hard_limit; - int reclaim_percentage; - int soft_limit; - int timeout_secs; -}; - -struct ncache_orangefs_obj { - struct kobject kobj; - int hard_limit; - int reclaim_percentage; - int soft_limit; - int timeout_msecs; -}; - -struct pc_orangefs_obj { - struct kobject kobj; - char *acache; - char *capcache; - char *ncache; -}; - -struct stats_orangefs_obj { - struct kobject kobj; - int reads; - int writes; -}; +/* + * Every item calls orangefs_attr_show and orangefs_attr_store through + * orangefs_sysfs_ops. They look at the orangefs_attributes further below to + * call one of sysfs_int_show, sysfs_int_store, sysfs_service_op_show, or + * sysfs_service_op_store. + */ struct orangefs_attribute { struct attribute attr; - ssize_t (*show)(struct orangefs_obj *orangefs_obj, + ssize_t (*show)(struct kobject *kobj, struct orangefs_attribute *attr, char *buf); - ssize_t (*store)(struct orangefs_obj *orangefs_obj, + ssize_t (*store)(struct kobject *kobj, struct orangefs_attribute *attr, const char *buf, size_t count); }; -struct acache_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct acache_orangefs_obj *acache_orangefs_obj, - struct acache_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct acache_orangefs_obj *acache_orangefs_obj, - struct acache_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct capcache_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct capcache_orangefs_obj *capcache_orangefs_obj, - struct capcache_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct capcache_orangefs_obj *capcache_orangefs_obj, - struct capcache_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct ccache_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct ccache_orangefs_obj *ccache_orangefs_obj, - struct ccache_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct ccache_orangefs_obj *ccache_orangefs_obj, - struct ccache_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct ncache_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct ncache_orangefs_obj *ncache_orangefs_obj, - struct ncache_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct ncache_orangefs_obj *ncache_orangefs_obj, - struct ncache_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct pc_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct pc_orangefs_obj *pc_orangefs_obj, - struct pc_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct pc_orangefs_obj *pc_orangefs_obj, - struct pc_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct stats_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct stats_orangefs_obj *stats_orangefs_obj, - struct stats_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct stats_orangefs_obj *stats_orangefs_obj, - struct stats_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - static ssize_t orangefs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct orangefs_attribute *attribute; - struct orangefs_obj *orangefs_obj; - int rc; attribute = container_of(attr, struct orangefs_attribute, attr); - orangefs_obj = container_of(kobj, struct orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(orangefs_obj, attribute, buf); - -out: - return rc; + if (!attribute->show) + return -EIO; + return attribute->show(kobj, attribute, buf); } static ssize_t orangefs_attr_store(struct kobject *kobj, @@ -282,24 +175,15 @@ static ssize_t orangefs_attr_store(struct kobject *kobj, size_t len) { struct orangefs_attribute *attribute; - struct orangefs_obj *orangefs_obj; - int rc; - gossip_debug(GOSSIP_SYSFS_DEBUG, - "orangefs_attr_store: start\n"); + if (!strcmp(kobj->name, PC_KOBJ_ID) || + !strcmp(kobj->name, STATS_KOBJ_ID)) + return -EPERM; attribute = container_of(attr, struct orangefs_attribute, attr); - orangefs_obj = container_of(kobj, struct orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(orangefs_obj, attribute, buf, len); - -out: - return rc; + if (!attribute->store) + return -EIO; + return attribute->store(kobj, attribute, buf, len); } static const struct sysfs_ops orangefs_sysfs_ops = { @@ -307,402 +191,58 @@ static const struct sysfs_ops orangefs_sysfs_ops = { .store = orangefs_attr_store, }; -static ssize_t acache_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct acache_orangefs_attribute *attribute; - struct acache_orangefs_obj *acache_orangefs_obj; - int rc; - - attribute = container_of(attr, struct acache_orangefs_attribute, attr); - acache_orangefs_obj = - container_of(kobj, struct acache_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(acache_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static ssize_t acache_orangefs_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t len) -{ - struct acache_orangefs_attribute *attribute; - struct acache_orangefs_obj *acache_orangefs_obj; - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "acache_orangefs_attr_store: start\n"); - - attribute = container_of(attr, struct acache_orangefs_attribute, attr); - acache_orangefs_obj = - container_of(kobj, struct acache_orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(acache_orangefs_obj, attribute, buf, len); - -out: - return rc; -} - -static const struct sysfs_ops acache_orangefs_sysfs_ops = { - .show = acache_orangefs_attr_show, - .store = acache_orangefs_attr_store, -}; - -static ssize_t capcache_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct capcache_orangefs_attribute *attribute; - struct capcache_orangefs_obj *capcache_orangefs_obj; - int rc; - - attribute = - container_of(attr, struct capcache_orangefs_attribute, attr); - capcache_orangefs_obj = - container_of(kobj, struct capcache_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(capcache_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static ssize_t capcache_orangefs_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t len) -{ - struct capcache_orangefs_attribute *attribute; - struct capcache_orangefs_obj *capcache_orangefs_obj; - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "capcache_orangefs_attr_store: start\n"); - - attribute = - container_of(attr, struct capcache_orangefs_attribute, attr); - capcache_orangefs_obj = - container_of(kobj, struct capcache_orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(capcache_orangefs_obj, attribute, buf, len); - -out: - return rc; -} - -static const struct sysfs_ops capcache_orangefs_sysfs_ops = { - .show = capcache_orangefs_attr_show, - .store = capcache_orangefs_attr_store, -}; - -static ssize_t ccache_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ccache_orangefs_attribute *attribute; - struct ccache_orangefs_obj *ccache_orangefs_obj; - int rc; - - attribute = - container_of(attr, struct ccache_orangefs_attribute, attr); - ccache_orangefs_obj = - container_of(kobj, struct ccache_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(ccache_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static ssize_t ccache_orangefs_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t len) -{ - struct ccache_orangefs_attribute *attribute; - struct ccache_orangefs_obj *ccache_orangefs_obj; - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "ccache_orangefs_attr_store: start\n"); - - attribute = - container_of(attr, struct ccache_orangefs_attribute, attr); - ccache_orangefs_obj = - container_of(kobj, struct ccache_orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(ccache_orangefs_obj, attribute, buf, len); - -out: - return rc; -} - -static const struct sysfs_ops ccache_orangefs_sysfs_ops = { - .show = ccache_orangefs_attr_show, - .store = ccache_orangefs_attr_store, -}; - -static ssize_t ncache_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ncache_orangefs_attribute *attribute; - struct ncache_orangefs_obj *ncache_orangefs_obj; - int rc; - - attribute = container_of(attr, struct ncache_orangefs_attribute, attr); - ncache_orangefs_obj = - container_of(kobj, struct ncache_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(ncache_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static ssize_t ncache_orangefs_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t len) -{ - struct ncache_orangefs_attribute *attribute; - struct ncache_orangefs_obj *ncache_orangefs_obj; - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "ncache_orangefs_attr_store: start\n"); - - attribute = container_of(attr, struct ncache_orangefs_attribute, attr); - ncache_orangefs_obj = - container_of(kobj, struct ncache_orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(ncache_orangefs_obj, attribute, buf, len); - -out: - return rc; -} - -static const struct sysfs_ops ncache_orangefs_sysfs_ops = { - .show = ncache_orangefs_attr_show, - .store = ncache_orangefs_attr_store, -}; - -static ssize_t pc_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct pc_orangefs_attribute *attribute; - struct pc_orangefs_obj *pc_orangefs_obj; - int rc; - - attribute = container_of(attr, struct pc_orangefs_attribute, attr); - pc_orangefs_obj = - container_of(kobj, struct pc_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(pc_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static const struct sysfs_ops pc_orangefs_sysfs_ops = { - .show = pc_orangefs_attr_show, -}; - -static ssize_t stats_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct stats_orangefs_attribute *attribute; - struct stats_orangefs_obj *stats_orangefs_obj; - int rc; - - attribute = container_of(attr, struct stats_orangefs_attribute, attr); - stats_orangefs_obj = - container_of(kobj, struct stats_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(stats_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static const struct sysfs_ops stats_orangefs_sysfs_ops = { - .show = stats_orangefs_attr_show, -}; - -static void orangefs_release(struct kobject *kobj) -{ - struct orangefs_obj *orangefs_obj; - - orangefs_obj = container_of(kobj, struct orangefs_obj, kobj); - kfree(orangefs_obj); -} - -static void acache_orangefs_release(struct kobject *kobj) -{ - struct acache_orangefs_obj *acache_orangefs_obj; - - acache_orangefs_obj = - container_of(kobj, struct acache_orangefs_obj, kobj); - kfree(acache_orangefs_obj); -} - -static void capcache_orangefs_release(struct kobject *kobj) -{ - struct capcache_orangefs_obj *capcache_orangefs_obj; - - capcache_orangefs_obj = - container_of(kobj, struct capcache_orangefs_obj, kobj); - kfree(capcache_orangefs_obj); -} - -static void ccache_orangefs_release(struct kobject *kobj) -{ - struct ccache_orangefs_obj *ccache_orangefs_obj; - - ccache_orangefs_obj = - container_of(kobj, struct ccache_orangefs_obj, kobj); - kfree(ccache_orangefs_obj); -} - -static void ncache_orangefs_release(struct kobject *kobj) -{ - struct ncache_orangefs_obj *ncache_orangefs_obj; - - ncache_orangefs_obj = - container_of(kobj, struct ncache_orangefs_obj, kobj); - kfree(ncache_orangefs_obj); -} - -static void pc_orangefs_release(struct kobject *kobj) -{ - struct pc_orangefs_obj *pc_orangefs_obj; - - pc_orangefs_obj = - container_of(kobj, struct pc_orangefs_obj, kobj); - kfree(pc_orangefs_obj); -} - -static void stats_orangefs_release(struct kobject *kobj) -{ - struct stats_orangefs_obj *stats_orangefs_obj; - - stats_orangefs_obj = - container_of(kobj, struct stats_orangefs_obj, kobj); - kfree(stats_orangefs_obj); -} - -static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr) +static ssize_t sysfs_int_show(struct kobject *kobj, + struct orangefs_attribute *attr, char *buf) { int rc = -EIO; - struct orangefs_attribute *orangefs_attr; - struct stats_orangefs_attribute *stats_orangefs_attr; - gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n", kobj_id); + gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n", + kobj->name); - if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) { - orangefs_attr = (struct orangefs_attribute *)attr; - - if (!strcmp(orangefs_attr->attr.name, "op_timeout_secs")) { + if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "op_timeout_secs")) { rc = scnprintf(buf, PAGE_SIZE, "%d\n", op_timeout_secs); goto out; - } else if (!strcmp(orangefs_attr->attr.name, + } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) { rc = scnprintf(buf, PAGE_SIZE, "%d\n", slot_timeout_secs); goto out; - } else if (!strcmp(orangefs_attr->attr.name, + } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { rc = scnprintf(buf, PAGE_SIZE, "%d\n", - dcache_timeout_msecs); + orangefs_dcache_timeout_msecs); goto out; - } else if (!strcmp(orangefs_attr->attr.name, + } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) { rc = scnprintf(buf, PAGE_SIZE, "%d\n", - getattr_timeout_msecs); + orangefs_getattr_timeout_msecs); goto out; } else { goto out; } - } else if (!strcmp(kobj_id, STATS_KOBJ_ID)) { - stats_orangefs_attr = (struct stats_orangefs_attribute *)attr; - - if (!strcmp(stats_orangefs_attr->attr.name, "reads")) { + } else if (!strcmp(kobj->name, STATS_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "reads")) { rc = scnprintf(buf, PAGE_SIZE, "%lu\n", - g_orangefs_stats.reads); + orangefs_stats.reads); goto out; - } else if (!strcmp(stats_orangefs_attr->attr.name, "writes")) { + } else if (!strcmp(attr->attr.name, "writes")) { rc = scnprintf(buf, PAGE_SIZE, "%lu\n", - g_orangefs_stats.writes); + orangefs_stats.writes); goto out; } else { goto out; @@ -714,45 +254,13 @@ out: return rc; } -static ssize_t int_orangefs_show(struct orangefs_obj *orangefs_obj, - struct orangefs_attribute *attr, - char *buf) -{ - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "int_orangefs_show:start attr->attr.name:%s:\n", - attr->attr.name); - - rc = sysfs_int_show(ORANGEFS_KOBJ_ID, buf, (void *) attr); - - return rc; -} - -static ssize_t int_stats_show(struct stats_orangefs_obj *stats_orangefs_obj, - struct stats_orangefs_attribute *attr, - char *buf) -{ - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "int_stats_show:start attr->attr.name:%s:\n", - attr->attr.name); - - rc = sysfs_int_show(STATS_KOBJ_ID, buf, (void *) attr); - - return rc; -} - -static ssize_t int_store(struct orangefs_obj *orangefs_obj, - struct orangefs_attribute *attr, - const char *buf, - size_t count) +static ssize_t sysfs_int_store(struct kobject *kobj, + struct orangefs_attribute *attr, const char *buf, size_t count) { int rc = 0; gossip_debug(GOSSIP_SYSFS_DEBUG, - "int_store: start attr->attr.name:%s: buf:%s:\n", + "sysfs_int_store: start attr->attr.name:%s: buf:%s:\n", attr->attr.name, buf); if (!strcmp(attr->attr.name, "op_timeout_secs")) { @@ -762,10 +270,10 @@ static ssize_t int_store(struct orangefs_obj *orangefs_obj, rc = kstrtoint(buf, 0, &slot_timeout_secs); goto out; } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { - rc = kstrtoint(buf, 0, &dcache_timeout_msecs); + rc = kstrtoint(buf, 0, &orangefs_dcache_timeout_msecs); goto out; } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) { - rc = kstrtoint(buf, 0, &getattr_timeout_msecs); + rc = kstrtoint(buf, 0, &orangefs_getattr_timeout_msecs); goto out; } else { goto out; @@ -783,24 +291,19 @@ out: /* * obtain attribute values from userspace with a service operation. */ -static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr) +static ssize_t sysfs_service_op_show(struct kobject *kobj, + struct orangefs_attribute *attr, char *buf) { struct orangefs_kernel_op_s *new_op = NULL; int rc = 0; char *ser_op_type = NULL; - struct orangefs_attribute *orangefs_attr; - struct acache_orangefs_attribute *acache_attr; - struct capcache_orangefs_attribute *capcache_attr; - struct ccache_orangefs_attribute *ccache_attr; - struct ncache_orangefs_attribute *ncache_attr; - struct pc_orangefs_attribute *pc_attr; __u32 op_alloc_type; gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_service_op_show: id:%s:\n", - kobj_id); + kobj->name); - if (strcmp(kobj_id, PC_KOBJ_ID)) + if (strcmp(kobj->name, PC_KOBJ_ID)) op_alloc_type = ORANGEFS_VFS_OP_PARAM; else op_alloc_type = ORANGEFS_VFS_OP_PERF_COUNT; @@ -818,124 +321,135 @@ static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr) goto out; } - if (strcmp(kobj_id, PC_KOBJ_ID)) + if (strcmp(kobj->name, PC_KOBJ_ID)) new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_GET; - if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) { - orangefs_attr = (struct orangefs_attribute *)attr; + if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { + /* Drop unsupported requests first. */ + if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) && + (!strcmp(attr->attr.name, "readahead_count") || + !strcmp(attr->attr.name, "readahead_size") || + !strcmp(attr->attr.name, "readahead_count_size"))) { + rc = -EINVAL; + goto out; + } - if (!strcmp(orangefs_attr->attr.name, "perf_history_size")) + if (!strcmp(attr->attr.name, "perf_history_size")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE; - else if (!strcmp(orangefs_attr->attr.name, + else if (!strcmp(attr->attr.name, "perf_time_interval_secs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS; - else if (!strcmp(orangefs_attr->attr.name, + else if (!strcmp(attr->attr.name, "perf_counter_reset")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_RESET; - } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) { - acache_attr = (struct acache_orangefs_attribute *)attr; + else if (!strcmp(attr->attr.name, + "readahead_count")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT; + + else if (!strcmp(attr->attr.name, + "readahead_size")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE; - if (!strcmp(acache_attr->attr.name, "timeout_msecs")) + else if (!strcmp(attr->attr.name, + "readahead_count_size")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; + } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "timeout_msecs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS; - if (!strcmp(acache_attr->attr.name, "hard_limit")) + if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT; - if (!strcmp(acache_attr->attr.name, "soft_limit")) + if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT; - if (!strcmp(acache_attr->attr.name, "reclaim_percentage")) + if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE; - } else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) { - capcache_attr = (struct capcache_orangefs_attribute *)attr; - - if (!strcmp(capcache_attr->attr.name, "timeout_secs")) + } else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "timeout_secs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS; - if (!strcmp(capcache_attr->attr.name, "hard_limit")) + if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT; - if (!strcmp(capcache_attr->attr.name, "soft_limit")) + if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT; - if (!strcmp(capcache_attr->attr.name, "reclaim_percentage")) + if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE; - } else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) { - ccache_attr = (struct ccache_orangefs_attribute *)attr; - - if (!strcmp(ccache_attr->attr.name, "timeout_secs")) + } else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "timeout_secs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS; - if (!strcmp(ccache_attr->attr.name, "hard_limit")) + if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT; - if (!strcmp(ccache_attr->attr.name, "soft_limit")) + if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT; - if (!strcmp(ccache_attr->attr.name, "reclaim_percentage")) + if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE; - } else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) { - ncache_attr = (struct ncache_orangefs_attribute *)attr; - - if (!strcmp(ncache_attr->attr.name, "timeout_msecs")) + } else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "timeout_msecs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS; - if (!strcmp(ncache_attr->attr.name, "hard_limit")) + if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT; - if (!strcmp(ncache_attr->attr.name, "soft_limit")) + if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT; - if (!strcmp(ncache_attr->attr.name, "reclaim_percentage")) + if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE; - } else if (!strcmp(kobj_id, PC_KOBJ_ID)) { - pc_attr = (struct pc_orangefs_attribute *)attr; - - if (!strcmp(pc_attr->attr.name, ACACHE_KOBJ_ID)) + } else if (!strcmp(kobj->name, PC_KOBJ_ID)) { + if (!strcmp(attr->attr.name, ACACHE_KOBJ_ID)) new_op->upcall.req.perf_count.type = ORANGEFS_PERF_COUNT_REQUEST_ACACHE; - if (!strcmp(pc_attr->attr.name, CAPCACHE_KOBJ_ID)) + if (!strcmp(attr->attr.name, CAPCACHE_KOBJ_ID)) new_op->upcall.req.perf_count.type = ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE; - if (!strcmp(pc_attr->attr.name, NCACHE_KOBJ_ID)) + if (!strcmp(attr->attr.name, NCACHE_KOBJ_ID)) new_op->upcall.req.perf_count.type = ORANGEFS_PERF_COUNT_REQUEST_NCACHE; } else { gossip_err("sysfs_service_op_show: unknown kobj_id:%s:\n", - kobj_id); + kobj->name); rc = -EINVAL; goto out; } - if (strcmp(kobj_id, PC_KOBJ_ID)) + if (strcmp(kobj->name, PC_KOBJ_ID)) ser_op_type = "orangefs_param"; else ser_op_type = "orangefs_perf_count"; @@ -948,11 +462,18 @@ static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr) out: if (!rc) { - if (strcmp(kobj_id, PC_KOBJ_ID)) { - rc = scnprintf(buf, - PAGE_SIZE, - "%d\n", - (int)new_op->downcall.resp.param.value); + if (strcmp(kobj->name, PC_KOBJ_ID)) { + if (new_op->upcall.req.param.op == + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE) { + rc = scnprintf(buf, PAGE_SIZE, "%d %d\n", + (int)new_op->downcall.resp.param.u. + value32[0], + (int)new_op->downcall.resp.param.u. + value32[1]); + } else { + rc = scnprintf(buf, PAGE_SIZE, "%d\n", + (int)new_op->downcall.resp.param.u.value64); + } } else { rc = scnprintf( buf, @@ -968,77 +489,6 @@ out: } -static ssize_t service_orangefs_show(struct orangefs_obj *orangefs_obj, - struct orangefs_attribute *attr, - char *buf) -{ - int rc = 0; - - rc = sysfs_service_op_show(ORANGEFS_KOBJ_ID, buf, (void *)attr); - - return rc; -} - -static ssize_t - service_acache_show(struct acache_orangefs_obj *acache_orangefs_obj, - struct acache_orangefs_attribute *attr, - char *buf) -{ - int rc = 0; - - rc = sysfs_service_op_show(ACACHE_KOBJ_ID, buf, (void *)attr); - - return rc; -} - -static ssize_t service_capcache_show(struct capcache_orangefs_obj - *capcache_orangefs_obj, - struct capcache_orangefs_attribute *attr, - char *buf) -{ - int rc = 0; - - rc = sysfs_service_op_show(CAPCACHE_KOBJ_ID, buf, (void *)attr); - - return rc; -} - -static ssize_t service_ccache_show(struct ccache_orangefs_obj - *ccache_orangefs_obj, - struct ccache_orangefs_attribute *attr, - char *buf) -{ - int rc = 0; - - rc = sysfs_service_op_show(CCACHE_KOBJ_ID, buf, (void *)attr); - - return rc; -} - -static ssize_t - service_ncache_show(struct ncache_orangefs_obj *ncache_orangefs_obj, - struct ncache_orangefs_attribute *attr, - char *buf) -{ - int rc = 0; - - rc = sysfs_service_op_show(NCACHE_KOBJ_ID, buf, (void *)attr); - - return rc; -} - -static ssize_t - service_pc_show(struct pc_orangefs_obj *pc_orangefs_obj, - struct pc_orangefs_attribute *attr, - char *buf) -{ - int rc = 0; - - rc = sysfs_service_op_show(PC_KOBJ_ID, buf, (void *)attr); - - return rc; -} - /* * pass attribute values back to userspace with a service operation. * @@ -1050,20 +500,16 @@ static ssize_t * We want to return 1 if we think everything went OK, and * EINVAL if not. */ -static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) +static ssize_t sysfs_service_op_store(struct kobject *kobj, + struct orangefs_attribute *attr, const char *buf, size_t count) { struct orangefs_kernel_op_s *new_op = NULL; int val = 0; int rc = 0; - struct orangefs_attribute *orangefs_attr; - struct acache_orangefs_attribute *acache_attr; - struct capcache_orangefs_attribute *capcache_attr; - struct ccache_orangefs_attribute *ccache_attr; - struct ncache_orangefs_attribute *ncache_attr; gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_service_op_store: id:%s:\n", - kobj_id); + kobj->name); new_op = op_alloc(ORANGEFS_VFS_OP_PARAM); if (!new_op) @@ -1079,16 +525,29 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) } /* - * The value we want to send back to userspace is in buf. + * The value we want to send back to userspace is in buf, unless this + * there are two parameters, which is specially handled below. */ - rc = kstrtoint(buf, 0, &val); - if (rc) - goto out; + if (strcmp(kobj->name, ORANGEFS_KOBJ_ID) || + strcmp(attr->attr.name, "readahead_count_size")) { + rc = kstrtoint(buf, 0, &val); + if (rc) + goto out; + } - if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) { - orangefs_attr = (struct orangefs_attribute *)attr; + new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET; + + if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { + /* Drop unsupported requests first. */ + if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) && + (!strcmp(attr->attr.name, "readahead_count") || + !strcmp(attr->attr.name, "readahead_size") || + !strcmp(attr->attr.name, "readahead_count_size"))) { + rc = -EINVAL; + goto out; + } - if (!strcmp(orangefs_attr->attr.name, "perf_history_size")) { + if (!strcmp(attr->attr.name, "perf_history_size")) { if (val > 0) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE; @@ -1096,7 +555,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(orangefs_attr->attr.name, + } else if (!strcmp(attr->attr.name, "perf_time_interval_secs")) { if (val > 0) { new_op->upcall.req.param.op = @@ -1105,7 +564,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(orangefs_attr->attr.name, + } else if (!strcmp(attr->attr.name, "perf_counter_reset")) { if ((val == 0) || (val == 1)) { new_op->upcall.req.param.op = @@ -1114,12 +573,55 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } + } else if (!strcmp(attr->attr.name, + "readahead_count")) { + if ((val >= 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, + "readahead_size")) { + if ((val >= 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, + "readahead_count_size")) { + int val1, val2; + rc = sscanf(buf, "%d %d", &val1, &val2); + if (rc < 2) { + rc = 0; + goto out; + } + if ((val1 >= 0) && (val2 >= 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; + } else { + rc = 0; + goto out; + } + new_op->upcall.req.param.u.value32[0] = val1; + new_op->upcall.req.param.u.value32[1] = val2; + goto value_set; + } else if (!strcmp(attr->attr.name, + "perf_counter_reset")) { + if ((val > 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; + } else { + rc = 0; + goto out; + } } - } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) { - acache_attr = (struct acache_orangefs_attribute *)attr; - - if (!strcmp(acache_attr->attr.name, "hard_limit")) { + } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT; @@ -1127,7 +629,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(acache_attr->attr.name, "soft_limit")) { + } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT; @@ -1135,7 +637,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(acache_attr->attr.name, + } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = @@ -1144,7 +646,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(acache_attr->attr.name, "timeout_msecs")) { + } else if (!strcmp(attr->attr.name, "timeout_msecs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS; @@ -1154,10 +656,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) } } - } else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) { - capcache_attr = (struct capcache_orangefs_attribute *)attr; - - if (!strcmp(capcache_attr->attr.name, "hard_limit")) { + } else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT; @@ -1165,7 +665,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(capcache_attr->attr.name, "soft_limit")) { + } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT; @@ -1173,7 +673,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(capcache_attr->attr.name, + } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = @@ -1182,7 +682,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(capcache_attr->attr.name, "timeout_secs")) { + } else if (!strcmp(attr->attr.name, "timeout_secs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS; @@ -1192,10 +692,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) } } - } else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) { - ccache_attr = (struct ccache_orangefs_attribute *)attr; - - if (!strcmp(ccache_attr->attr.name, "hard_limit")) { + } else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT; @@ -1203,7 +701,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(ccache_attr->attr.name, "soft_limit")) { + } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT; @@ -1211,7 +709,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(ccache_attr->attr.name, + } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = @@ -1220,7 +718,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(ccache_attr->attr.name, "timeout_secs")) { + } else if (!strcmp(attr->attr.name, "timeout_secs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS; @@ -1230,10 +728,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) } } - } else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) { - ncache_attr = (struct ncache_orangefs_attribute *)attr; - - if (!strcmp(ncache_attr->attr.name, "hard_limit")) { + } else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT; @@ -1241,7 +737,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(ncache_attr->attr.name, "soft_limit")) { + } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT; @@ -1249,7 +745,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(ncache_attr->attr.name, + } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = @@ -1258,7 +754,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(ncache_attr->attr.name, "timeout_msecs")) { + } else if (!strcmp(attr->attr.name, "timeout_msecs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS; @@ -1270,14 +766,13 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) } else { gossip_err("sysfs_service_op_store: unknown kobj_id:%s:\n", - kobj_id); + kobj->name); rc = -EINVAL; goto out; } - new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET; - - new_op->upcall.req.param.value = val; + new_op->upcall.req.param.u.value64 = val; +value_set: /* * The service_operation will return a errno return code on @@ -1290,7 +785,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc); rc = 0; } else { - rc = 1; + rc = count; } out: @@ -1302,127 +797,56 @@ out: return rc; } -static ssize_t - service_orangefs_store(struct orangefs_obj *orangefs_obj, - struct orangefs_attribute *attr, - const char *buf, - size_t count) -{ - int rc = 0; - - rc = sysfs_service_op_store(ORANGEFS_KOBJ_ID, buf, (void *) attr); - - /* rc should have an errno value if the service_op went bad. */ - if (rc == 1) - rc = count; - - return rc; -} - -static ssize_t - service_acache_store(struct acache_orangefs_obj *acache_orangefs_obj, - struct acache_orangefs_attribute *attr, - const char *buf, - size_t count) -{ - int rc = 0; - - rc = sysfs_service_op_store(ACACHE_KOBJ_ID, buf, (void *) attr); - - /* rc should have an errno value if the service_op went bad. */ - if (rc == 1) - rc = count; - - return rc; -} - -static ssize_t - service_capcache_store(struct capcache_orangefs_obj - *capcache_orangefs_obj, - struct capcache_orangefs_attribute *attr, - const char *buf, - size_t count) -{ - int rc = 0; - - rc = sysfs_service_op_store(CAPCACHE_KOBJ_ID, buf, (void *) attr); - - /* rc should have an errno value if the service_op went bad. */ - if (rc == 1) - rc = count; - - return rc; -} - -static ssize_t service_ccache_store(struct ccache_orangefs_obj - *ccache_orangefs_obj, - struct ccache_orangefs_attribute *attr, - const char *buf, - size_t count) -{ - int rc = 0; - - rc = sysfs_service_op_store(CCACHE_KOBJ_ID, buf, (void *) attr); - - /* rc should have an errno value if the service_op went bad. */ - if (rc == 1) - rc = count; - - return rc; -} - -static ssize_t - service_ncache_store(struct ncache_orangefs_obj *ncache_orangefs_obj, - struct ncache_orangefs_attribute *attr, - const char *buf, - size_t count) -{ - int rc = 0; - - rc = sysfs_service_op_store(NCACHE_KOBJ_ID, buf, (void *) attr); - - /* rc should have an errno value if the service_op went bad. */ - if (rc == 1) - rc = count; - - return rc; -} - static struct orangefs_attribute op_timeout_secs_attribute = - __ATTR(op_timeout_secs, 0664, int_orangefs_show, int_store); + __ATTR(op_timeout_secs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute slot_timeout_secs_attribute = - __ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store); + __ATTR(slot_timeout_secs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute dcache_timeout_msecs_attribute = - __ATTR(dcache_timeout_msecs, 0664, int_orangefs_show, int_store); + __ATTR(dcache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute getattr_timeout_msecs_attribute = - __ATTR(getattr_timeout_msecs, 0664, int_orangefs_show, int_store); + __ATTR(getattr_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); + +static struct orangefs_attribute readahead_count_attribute = + __ATTR(readahead_count, 0664, sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute readahead_size_attribute = + __ATTR(readahead_size, 0664, sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute readahead_count_size_attribute = + __ATTR(readahead_count_size, 0664, sysfs_service_op_show, + sysfs_service_op_store); static struct orangefs_attribute perf_counter_reset_attribute = __ATTR(perf_counter_reset, 0664, - service_orangefs_show, - service_orangefs_store); + sysfs_service_op_show, + sysfs_service_op_store); static struct orangefs_attribute perf_history_size_attribute = __ATTR(perf_history_size, 0664, - service_orangefs_show, - service_orangefs_store); + sysfs_service_op_show, + sysfs_service_op_store); static struct orangefs_attribute perf_time_interval_secs_attribute = __ATTR(perf_time_interval_secs, 0664, - service_orangefs_show, - service_orangefs_store); + sysfs_service_op_show, + sysfs_service_op_store); static struct attribute *orangefs_default_attrs[] = { &op_timeout_secs_attribute.attr, &slot_timeout_secs_attribute.attr, &dcache_timeout_msecs_attribute.attr, &getattr_timeout_msecs_attribute.attr, + &readahead_count_attribute.attr, + &readahead_size_attribute.attr, + &readahead_count_size_attribute.attr, &perf_counter_reset_attribute.attr, &perf_history_size_attribute.attr, &perf_time_interval_secs_attribute.attr, @@ -1431,33 +855,32 @@ static struct attribute *orangefs_default_attrs[] = { static struct kobj_type orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .release = orangefs_release, .default_attrs = orangefs_default_attrs, }; -static struct acache_orangefs_attribute acache_hard_limit_attribute = +static struct orangefs_attribute acache_hard_limit_attribute = __ATTR(hard_limit, 0664, - service_acache_show, - service_acache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct acache_orangefs_attribute acache_reclaim_percent_attribute = +static struct orangefs_attribute acache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, - service_acache_show, - service_acache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct acache_orangefs_attribute acache_soft_limit_attribute = +static struct orangefs_attribute acache_soft_limit_attribute = __ATTR(soft_limit, 0664, - service_acache_show, - service_acache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct acache_orangefs_attribute acache_timeout_msecs_attribute = +static struct orangefs_attribute acache_timeout_msecs_attribute = __ATTR(timeout_msecs, 0664, - service_acache_show, - service_acache_store); + sysfs_service_op_show, + sysfs_service_op_store); static struct attribute *acache_orangefs_default_attrs[] = { &acache_hard_limit_attribute.attr, @@ -1468,34 +891,33 @@ static struct attribute *acache_orangefs_default_attrs[] = { }; static struct kobj_type acache_orangefs_ktype = { - .sysfs_ops = &acache_orangefs_sysfs_ops, - .release = acache_orangefs_release, + .sysfs_ops = &orangefs_sysfs_ops, .default_attrs = acache_orangefs_default_attrs, }; -static struct capcache_orangefs_attribute capcache_hard_limit_attribute = +static struct orangefs_attribute capcache_hard_limit_attribute = __ATTR(hard_limit, 0664, - service_capcache_show, - service_capcache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct capcache_orangefs_attribute capcache_reclaim_percent_attribute = +static struct orangefs_attribute capcache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, - service_capcache_show, - service_capcache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct capcache_orangefs_attribute capcache_soft_limit_attribute = +static struct orangefs_attribute capcache_soft_limit_attribute = __ATTR(soft_limit, 0664, - service_capcache_show, - service_capcache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct capcache_orangefs_attribute capcache_timeout_secs_attribute = +static struct orangefs_attribute capcache_timeout_secs_attribute = __ATTR(timeout_secs, 0664, - service_capcache_show, - service_capcache_store); + sysfs_service_op_show, + sysfs_service_op_store); static struct attribute *capcache_orangefs_default_attrs[] = { &capcache_hard_limit_attribute.attr, @@ -1506,34 +928,33 @@ static struct attribute *capcache_orangefs_default_attrs[] = { }; static struct kobj_type capcache_orangefs_ktype = { - .sysfs_ops = &capcache_orangefs_sysfs_ops, - .release = capcache_orangefs_release, + .sysfs_ops = &orangefs_sysfs_ops, .default_attrs = capcache_orangefs_default_attrs, }; -static struct ccache_orangefs_attribute ccache_hard_limit_attribute = +static struct orangefs_attribute ccache_hard_limit_attribute = __ATTR(hard_limit, 0664, - service_ccache_show, - service_ccache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct ccache_orangefs_attribute ccache_reclaim_percent_attribute = +static struct orangefs_attribute ccache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, - service_ccache_show, - service_ccache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct ccache_orangefs_attribute ccache_soft_limit_attribute = +static struct orangefs_attribute ccache_soft_limit_attribute = __ATTR(soft_limit, 0664, - service_ccache_show, - service_ccache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct ccache_orangefs_attribute ccache_timeout_secs_attribute = +static struct orangefs_attribute ccache_timeout_secs_attribute = __ATTR(timeout_secs, 0664, - service_ccache_show, - service_ccache_store); + sysfs_service_op_show, + sysfs_service_op_store); static struct attribute *ccache_orangefs_default_attrs[] = { &ccache_hard_limit_attribute.attr, @@ -1544,34 +965,33 @@ static struct attribute *ccache_orangefs_default_attrs[] = { }; static struct kobj_type ccache_orangefs_ktype = { - .sysfs_ops = &ccache_orangefs_sysfs_ops, - .release = ccache_orangefs_release, + .sysfs_ops = &orangefs_sysfs_ops, .default_attrs = ccache_orangefs_default_attrs, }; -static struct ncache_orangefs_attribute ncache_hard_limit_attribute = +static struct orangefs_attribute ncache_hard_limit_attribute = __ATTR(hard_limit, 0664, - service_ncache_show, - service_ncache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct ncache_orangefs_attribute ncache_reclaim_percent_attribute = +static struct orangefs_attribute ncache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, - service_ncache_show, - service_ncache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct ncache_orangefs_attribute ncache_soft_limit_attribute = +static struct orangefs_attribute ncache_soft_limit_attribute = __ATTR(soft_limit, 0664, - service_ncache_show, - service_ncache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct ncache_orangefs_attribute ncache_timeout_msecs_attribute = +static struct orangefs_attribute ncache_timeout_msecs_attribute = __ATTR(timeout_msecs, 0664, - service_ncache_show, - service_ncache_store); + sysfs_service_op_show, + sysfs_service_op_store); static struct attribute *ncache_orangefs_default_attrs[] = { &ncache_hard_limit_attribute.attr, @@ -1582,27 +1002,26 @@ static struct attribute *ncache_orangefs_default_attrs[] = { }; static struct kobj_type ncache_orangefs_ktype = { - .sysfs_ops = &ncache_orangefs_sysfs_ops, - .release = ncache_orangefs_release, + .sysfs_ops = &orangefs_sysfs_ops, .default_attrs = ncache_orangefs_default_attrs, }; -static struct pc_orangefs_attribute pc_acache_attribute = +static struct orangefs_attribute pc_acache_attribute = __ATTR(acache, 0664, - service_pc_show, + sysfs_service_op_show, NULL); -static struct pc_orangefs_attribute pc_capcache_attribute = +static struct orangefs_attribute pc_capcache_attribute = __ATTR(capcache, 0664, - service_pc_show, + sysfs_service_op_show, NULL); -static struct pc_orangefs_attribute pc_ncache_attribute = +static struct orangefs_attribute pc_ncache_attribute = __ATTR(ncache, 0664, - service_pc_show, + sysfs_service_op_show, NULL); static struct attribute *pc_orangefs_default_attrs[] = { @@ -1613,21 +1032,20 @@ static struct attribute *pc_orangefs_default_attrs[] = { }; static struct kobj_type pc_orangefs_ktype = { - .sysfs_ops = &pc_orangefs_sysfs_ops, - .release = pc_orangefs_release, + .sysfs_ops = &orangefs_sysfs_ops, .default_attrs = pc_orangefs_default_attrs, }; -static struct stats_orangefs_attribute stats_reads_attribute = +static struct orangefs_attribute stats_reads_attribute = __ATTR(reads, 0664, - int_stats_show, + sysfs_int_show, NULL); -static struct stats_orangefs_attribute stats_writes_attribute = +static struct orangefs_attribute stats_writes_attribute = __ATTR(writes, 0664, - int_stats_show, + sysfs_int_show, NULL); static struct attribute *stats_orangefs_default_attrs[] = { @@ -1637,18 +1055,17 @@ static struct attribute *stats_orangefs_default_attrs[] = { }; static struct kobj_type stats_orangefs_ktype = { - .sysfs_ops = &stats_orangefs_sysfs_ops, - .release = stats_orangefs_release, + .sysfs_ops = &orangefs_sysfs_ops, .default_attrs = stats_orangefs_default_attrs, }; -static struct orangefs_obj *orangefs_obj; -static struct acache_orangefs_obj *acache_orangefs_obj; -static struct capcache_orangefs_obj *capcache_orangefs_obj; -static struct ccache_orangefs_obj *ccache_orangefs_obj; -static struct ncache_orangefs_obj *ncache_orangefs_obj; -static struct pc_orangefs_obj *pc_orangefs_obj; -static struct stats_orangefs_obj *stats_orangefs_obj; +static struct kobject *orangefs_obj; +static struct kobject *acache_orangefs_obj; +static struct kobject *capcache_orangefs_obj; +static struct kobject *ccache_orangefs_obj; +static struct kobject *ncache_orangefs_obj; +static struct kobject *pc_orangefs_obj; +static struct kobject *stats_orangefs_obj; int orangefs_sysfs_init(void) { @@ -1661,7 +1078,7 @@ int orangefs_sysfs_init(void) if (!orangefs_obj) goto out; - rc = kobject_init_and_add(&orangefs_obj->kobj, + rc = kobject_init_and_add(orangefs_obj, &orangefs_ktype, fs_kobj, ORANGEFS_KOBJ_ID); @@ -1669,7 +1086,7 @@ int orangefs_sysfs_init(void) if (rc) goto ofs_obj_bail; - kobject_uevent(&orangefs_obj->kobj, KOBJ_ADD); + kobject_uevent(orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/acache. */ acache_orangefs_obj = kzalloc(sizeof(*acache_orangefs_obj), GFP_KERNEL); @@ -1678,15 +1095,15 @@ int orangefs_sysfs_init(void) goto ofs_obj_bail; } - rc = kobject_init_and_add(&acache_orangefs_obj->kobj, + rc = kobject_init_and_add(acache_orangefs_obj, &acache_orangefs_ktype, - &orangefs_obj->kobj, + orangefs_obj, ACACHE_KOBJ_ID); if (rc) goto acache_obj_bail; - kobject_uevent(&acache_orangefs_obj->kobj, KOBJ_ADD); + kobject_uevent(acache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/capcache. */ capcache_orangefs_obj = @@ -1696,14 +1113,14 @@ int orangefs_sysfs_init(void) goto acache_obj_bail; } - rc = kobject_init_and_add(&capcache_orangefs_obj->kobj, + rc = kobject_init_and_add(capcache_orangefs_obj, &capcache_orangefs_ktype, - &orangefs_obj->kobj, + orangefs_obj, CAPCACHE_KOBJ_ID); if (rc) goto capcache_obj_bail; - kobject_uevent(&capcache_orangefs_obj->kobj, KOBJ_ADD); + kobject_uevent(capcache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/ccache. */ ccache_orangefs_obj = @@ -1713,14 +1130,14 @@ int orangefs_sysfs_init(void) goto capcache_obj_bail; } - rc = kobject_init_and_add(&ccache_orangefs_obj->kobj, + rc = kobject_init_and_add(ccache_orangefs_obj, &ccache_orangefs_ktype, - &orangefs_obj->kobj, + orangefs_obj, CCACHE_KOBJ_ID); if (rc) goto ccache_obj_bail; - kobject_uevent(&ccache_orangefs_obj->kobj, KOBJ_ADD); + kobject_uevent(ccache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/ncache. */ ncache_orangefs_obj = kzalloc(sizeof(*ncache_orangefs_obj), GFP_KERNEL); @@ -1729,15 +1146,15 @@ int orangefs_sysfs_init(void) goto ccache_obj_bail; } - rc = kobject_init_and_add(&ncache_orangefs_obj->kobj, + rc = kobject_init_and_add(ncache_orangefs_obj, &ncache_orangefs_ktype, - &orangefs_obj->kobj, + orangefs_obj, NCACHE_KOBJ_ID); if (rc) goto ncache_obj_bail; - kobject_uevent(&ncache_orangefs_obj->kobj, KOBJ_ADD); + kobject_uevent(ncache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/perf_counters. */ pc_orangefs_obj = kzalloc(sizeof(*pc_orangefs_obj), GFP_KERNEL); @@ -1746,15 +1163,15 @@ int orangefs_sysfs_init(void) goto ncache_obj_bail; } - rc = kobject_init_and_add(&pc_orangefs_obj->kobj, + rc = kobject_init_and_add(pc_orangefs_obj, &pc_orangefs_ktype, - &orangefs_obj->kobj, + orangefs_obj, "perf_counters"); if (rc) goto pc_obj_bail; - kobject_uevent(&pc_orangefs_obj->kobj, KOBJ_ADD); + kobject_uevent(pc_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/stats. */ stats_orangefs_obj = kzalloc(sizeof(*stats_orangefs_obj), GFP_KERNEL); @@ -1763,37 +1180,31 @@ int orangefs_sysfs_init(void) goto pc_obj_bail; } - rc = kobject_init_and_add(&stats_orangefs_obj->kobj, + rc = kobject_init_and_add(stats_orangefs_obj, &stats_orangefs_ktype, - &orangefs_obj->kobj, + orangefs_obj, STATS_KOBJ_ID); if (rc) goto stats_obj_bail; - kobject_uevent(&stats_orangefs_obj->kobj, KOBJ_ADD); + kobject_uevent(stats_orangefs_obj, KOBJ_ADD); goto out; stats_obj_bail: - kobject_put(&stats_orangefs_obj->kobj); - + kobject_put(stats_orangefs_obj); pc_obj_bail: - kobject_put(&pc_orangefs_obj->kobj); - + kobject_put(pc_orangefs_obj); ncache_obj_bail: - kobject_put(&ncache_orangefs_obj->kobj); - + kobject_put(ncache_orangefs_obj); ccache_obj_bail: - kobject_put(&ccache_orangefs_obj->kobj); - + kobject_put(ccache_orangefs_obj); capcache_obj_bail: - kobject_put(&capcache_orangefs_obj->kobj); - + kobject_put(capcache_orangefs_obj); acache_obj_bail: - kobject_put(&acache_orangefs_obj->kobj); - + kobject_put(acache_orangefs_obj); ofs_obj_bail: - kobject_put(&orangefs_obj->kobj); + kobject_put(orangefs_obj); out: return rc; } @@ -1801,13 +1212,11 @@ out: void orangefs_sysfs_exit(void) { gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_exit: start\n"); - - kobject_put(&acache_orangefs_obj->kobj); - kobject_put(&capcache_orangefs_obj->kobj); - kobject_put(&ccache_orangefs_obj->kobj); - kobject_put(&ncache_orangefs_obj->kobj); - kobject_put(&pc_orangefs_obj->kobj); - kobject_put(&stats_orangefs_obj->kobj); - - kobject_put(&orangefs_obj->kobj); + kobject_put(acache_orangefs_obj); + kobject_put(capcache_orangefs_obj); + kobject_put(ccache_orangefs_obj); + kobject_put(ncache_orangefs_obj); + kobject_put(pc_orangefs_obj); + kobject_put(stats_orangefs_obj); + kobject_put(orangefs_obj); } diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index d13c7291fd05..06af81f71e10 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -50,7 +50,7 @@ __s32 fsid_of_op(struct orangefs_kernel_op_s *op) case ORANGEFS_VFS_OP_TRUNCATE: fsid = op->upcall.req.truncate.refn.fs_id; break; - case ORANGEFS_VFS_OP_MMAP_RA_FLUSH: + case ORANGEFS_VFS_OP_RA_FLUSH: fsid = op->upcall.req.ra_cache_flush.refn.fs_id; break; case ORANGEFS_VFS_OP_FS_UMOUNT: @@ -347,7 +347,8 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) | orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes); - orangefs_inode->getattr_time = jiffies + getattr_timeout_msecs*HZ/1000; + orangefs_inode->getattr_time = jiffies + + orangefs_getattr_timeout_msecs*HZ/1000; ret = 0; out: op_release(new_op); @@ -656,401 +657,3 @@ __s32 ORANGEFS_util_translate_mode(int mode) return ret; } #undef NUM_MODES - -/* - * After obtaining a string representation of the client's debug - * keywords and their associated masks, this function is called to build an - * array of these values. - */ -int orangefs_prepare_cdm_array(char *debug_array_string) -{ - int i; - int rc = -EINVAL; - char *cds_head = NULL; - char *cds_delimiter = NULL; - int keyword_len = 0; - - gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); - - /* - * figure out how many elements the cdm_array needs. - */ - for (i = 0; i < strlen(debug_array_string); i++) - if (debug_array_string[i] == '\n') - cdm_element_count++; - - if (!cdm_element_count) { - pr_info("No elements in client debug array string!\n"); - goto out; - } - - cdm_array = - kzalloc(cdm_element_count * sizeof(struct client_debug_mask), - GFP_KERNEL); - if (!cdm_array) { - pr_info("malloc failed for cdm_array!\n"); - rc = -ENOMEM; - goto out; - } - - cds_head = debug_array_string; - - for (i = 0; i < cdm_element_count; i++) { - cds_delimiter = strchr(cds_head, '\n'); - *cds_delimiter = '\0'; - - keyword_len = strcspn(cds_head, " "); - - cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL); - if (!cdm_array[i].keyword) { - rc = -ENOMEM; - goto out; - } - - sscanf(cds_head, - "%s %llx %llx", - cdm_array[i].keyword, - (unsigned long long *)&(cdm_array[i].mask1), - (unsigned long long *)&(cdm_array[i].mask2)); - - if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE)) - client_verbose_index = i; - - if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL)) - client_all_index = i; - - cds_head = cds_delimiter + 1; - } - - rc = cdm_element_count; - - gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc); - -out: - - return rc; - -} - -/* - * /sys/kernel/debug/orangefs/debug-help can be catted to - * see all the available kernel and client debug keywords. - * - * When the kernel boots, we have no idea what keywords the - * client supports, nor their associated masks. - * - * We pass through this function once at boot and stamp a - * boilerplate "we don't know" message for the client in the - * debug-help file. We pass through here again when the client - * starts and then we can fill out the debug-help file fully. - * - * The client might be restarted any number of times between - * reboots, we only build the debug-help file the first time. - */ -int orangefs_prepare_debugfs_help_string(int at_boot) -{ - int rc = -EINVAL; - int i; - int byte_count = 0; - char *client_title = "Client Debug Keywords:\n"; - char *kernel_title = "Kernel Debug Keywords:\n"; - - gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); - - if (at_boot) { - byte_count += strlen(HELP_STRING_UNINITIALIZED); - client_title = HELP_STRING_UNINITIALIZED; - } else { - /* - * fill the client keyword/mask array and remember - * how many elements there were. - */ - cdm_element_count = - orangefs_prepare_cdm_array(client_debug_array_string); - if (cdm_element_count <= 0) - goto out; - - /* Count the bytes destined for debug_help_string. */ - byte_count += strlen(client_title); - - for (i = 0; i < cdm_element_count; i++) { - byte_count += strlen(cdm_array[i].keyword + 2); - if (byte_count >= DEBUG_HELP_STRING_SIZE) { - pr_info("%s: overflow 1!\n", __func__); - goto out; - } - } - - gossip_debug(GOSSIP_UTILS_DEBUG, - "%s: cdm_element_count:%d:\n", - __func__, - cdm_element_count); - } - - byte_count += strlen(kernel_title); - for (i = 0; i < num_kmod_keyword_mask_map; i++) { - byte_count += - strlen(s_kmod_keyword_mask_map[i].keyword + 2); - if (byte_count >= DEBUG_HELP_STRING_SIZE) { - pr_info("%s: overflow 2!\n", __func__); - goto out; - } - } - - /* build debug_help_string. */ - debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL); - if (!debug_help_string) { - rc = -ENOMEM; - goto out; - } - - strcat(debug_help_string, client_title); - - if (!at_boot) { - for (i = 0; i < cdm_element_count; i++) { - strcat(debug_help_string, "\t"); - strcat(debug_help_string, cdm_array[i].keyword); - strcat(debug_help_string, "\n"); - } - } - - strcat(debug_help_string, "\n"); - strcat(debug_help_string, kernel_title); - - for (i = 0; i < num_kmod_keyword_mask_map; i++) { - strcat(debug_help_string, "\t"); - strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword); - strcat(debug_help_string, "\n"); - } - - rc = 0; - -out: - - return rc; - -} - -/* - * kernel = type 0 - * client = type 1 - */ -void debug_mask_to_string(void *mask, int type) -{ - int i; - int len = 0; - char *debug_string; - int element_count = 0; - - gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); - - if (type) { - debug_string = client_debug_string; - element_count = cdm_element_count; - } else { - debug_string = kernel_debug_string; - element_count = num_kmod_keyword_mask_map; - } - - memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); - - /* - * Some keywords, like "all" or "verbose", are amalgams of - * numerous other keywords. Make a special check for those - * before grinding through the whole mask only to find out - * later... - */ - if (check_amalgam_keyword(mask, type)) - goto out; - - /* Build the debug string. */ - for (i = 0; i < element_count; i++) - if (type) - do_c_string(mask, i); - else - do_k_string(mask, i); - - len = strlen(debug_string); - - if ((len) && (type)) - client_debug_string[len - 1] = '\0'; - else if (len) - kernel_debug_string[len - 1] = '\0'; - else if (type) - strcpy(client_debug_string, "none"); - else - strcpy(kernel_debug_string, "none"); - -out: -gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n", __func__, debug_string); - - return; - -} - -void do_k_string(void *k_mask, int index) -{ - __u64 *mask = (__u64 *) k_mask; - - if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword)) - goto out; - - if (*mask & s_kmod_keyword_mask_map[index].mask_val) { - if ((strlen(kernel_debug_string) + - strlen(s_kmod_keyword_mask_map[index].keyword)) - < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) { - strcat(kernel_debug_string, - s_kmod_keyword_mask_map[index].keyword); - strcat(kernel_debug_string, ","); - } else { - gossip_err("%s: overflow!\n", __func__); - strcpy(kernel_debug_string, ORANGEFS_ALL); - goto out; - } - } - -out: - - return; -} - -void do_c_string(void *c_mask, int index) -{ - struct client_debug_mask *mask = (struct client_debug_mask *) c_mask; - - if (keyword_is_amalgam(cdm_array[index].keyword)) - goto out; - - if ((mask->mask1 & cdm_array[index].mask1) || - (mask->mask2 & cdm_array[index].mask2)) { - if ((strlen(client_debug_string) + - strlen(cdm_array[index].keyword) + 1) - < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) { - strcat(client_debug_string, - cdm_array[index].keyword); - strcat(client_debug_string, ","); - } else { - gossip_err("%s: overflow!\n", __func__); - strcpy(client_debug_string, ORANGEFS_ALL); - goto out; - } - } -out: - return; -} - -int keyword_is_amalgam(char *keyword) -{ - int rc = 0; - - if ((!strcmp(keyword, ORANGEFS_ALL)) || (!strcmp(keyword, ORANGEFS_VERBOSE))) - rc = 1; - - return rc; -} - -/* - * kernel = type 0 - * client = type 1 - * - * return 1 if we found an amalgam. - */ -int check_amalgam_keyword(void *mask, int type) -{ - __u64 *k_mask; - struct client_debug_mask *c_mask; - int k_all_index = num_kmod_keyword_mask_map - 1; - int rc = 0; - - if (type) { - c_mask = (struct client_debug_mask *) mask; - - if ((c_mask->mask1 == cdm_array[client_all_index].mask1) && - (c_mask->mask2 == cdm_array[client_all_index].mask2)) { - strcpy(client_debug_string, ORANGEFS_ALL); - rc = 1; - goto out; - } - - if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) && - (c_mask->mask2 == cdm_array[client_verbose_index].mask2)) { - strcpy(client_debug_string, ORANGEFS_VERBOSE); - rc = 1; - goto out; - } - - } else { - k_mask = (__u64 *) mask; - - if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) { - strcpy(kernel_debug_string, ORANGEFS_ALL); - rc = 1; - goto out; - } - } - -out: - - return rc; -} - -/* - * kernel = type 0 - * client = type 1 - */ -void debug_string_to_mask(char *debug_string, void *mask, int type) -{ - char *unchecked_keyword; - int i; - char *strsep_fodder = kstrdup(debug_string, GFP_KERNEL); - char *original_pointer; - int element_count = 0; - struct client_debug_mask *c_mask; - __u64 *k_mask; - - gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); - - if (type) { - c_mask = (struct client_debug_mask *)mask; - element_count = cdm_element_count; - } else { - k_mask = (__u64 *)mask; - *k_mask = 0; - element_count = num_kmod_keyword_mask_map; - } - - original_pointer = strsep_fodder; - while ((unchecked_keyword = strsep(&strsep_fodder, ","))) - if (strlen(unchecked_keyword)) { - for (i = 0; i < element_count; i++) - if (type) - do_c_mask(i, - unchecked_keyword, - &c_mask); - else - do_k_mask(i, - unchecked_keyword, - &k_mask); - } - - kfree(original_pointer); -} - -void do_c_mask(int i, - char *unchecked_keyword, - struct client_debug_mask **sane_mask) -{ - - if (!strcmp(cdm_array[i].keyword, unchecked_keyword)) { - (**sane_mask).mask1 = (**sane_mask).mask1 | cdm_array[i].mask1; - (**sane_mask).mask2 = (**sane_mask).mask2 | cdm_array[i].mask2; - } -} - -void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask) -{ - - if (!strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword)) - **sane_mask = (**sane_mask) | - s_kmod_keyword_mask_map[i].mask_val; -} diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h index 3d7418c728f5..971307ad69be 100644 --- a/fs/orangefs/protocol.h +++ b/fs/orangefs/protocol.h @@ -4,26 +4,6 @@ #include <linux/slab.h> #include <linux/ioctl.h> -extern struct client_debug_mask *cdm_array; -extern char *debug_help_string; -extern int help_string_initialized; -extern struct dentry *debug_dir; -extern struct dentry *help_file_dentry; -extern struct dentry *client_debug_dentry; -extern const struct file_operations debug_help_fops; -extern int client_all_index; -extern int client_verbose_index; -extern int cdm_element_count; -#define DEBUG_HELP_STRING_SIZE 4096 -#define HELP_STRING_UNINITIALIZED \ - "Client Debug Keywords are unknown until the first time\n" \ - "the client is started after boot.\n" -#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help" -#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug" -#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug" -#define ORANGEFS_VERBOSE "verbose" -#define ORANGEFS_ALL "all" - /* pvfs2-config.h ***********************************************************/ #define ORANGEFS_VERSION_MAJOR 2 #define ORANGEFS_VERSION_MINOR 9 @@ -426,13 +406,12 @@ do { \ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ } while (0) #else -extern __u64 gossip_debug_mask; -extern struct client_debug_mask client_debug_mask; +extern __u64 orangefs_gossip_debug_mask; /* try to avoid function call overhead by checking masks in macro */ #define gossip_debug(mask, fmt, ...) \ do { \ - if (gossip_debug_mask & (mask)) \ + if (orangefs_gossip_debug_mask & (mask)) \ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ } while (0) #endif /* GOSSIP_DISABLE_DEBUG */ diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index b9da9a0281c9..c48859f16e7b 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -33,6 +33,7 @@ static const match_table_t tokens = { { Opt_err, NULL } }; +uint64_t orangefs_features; static int parse_mount_options(struct super_block *sb, char *options, int silent) @@ -249,6 +250,19 @@ int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb) } op_release(new_op); + + if (orangefs_userspace_version >= 20906) { + new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); + if (!new_op) + return -ENOMEM; + new_op->upcall.req.features.features = 0; + ret = service_operation(new_op, "orangefs_features", 0); + orangefs_features = new_op->downcall.resp.features.features; + op_release(new_op); + } else { + orangefs_features = 0; + } + return ret; } @@ -492,6 +506,19 @@ struct dentry *orangefs_mount(struct file_system_type *fst, list_add_tail(&ORANGEFS_SB(sb)->list, &orangefs_superblocks); spin_unlock(&orangefs_superblocks_lock); op_release(new_op); + + if (orangefs_userspace_version >= 20906) { + new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); + if (!new_op) + return ERR_PTR(-ENOMEM); + new_op->upcall.req.features.features = 0; + ret = service_operation(new_op, "orangefs_features", 0); + orangefs_features = new_op->downcall.resp.features.features; + op_release(new_op); + } else { + orangefs_features = 0; + } + return dget(sb->s_root); free_op: @@ -530,8 +557,8 @@ void orangefs_kill_sb(struct super_block *sb) * make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us * gets completed before we free the dang thing. */ - mutex_lock(&request_mutex); - mutex_unlock(&request_mutex); + mutex_lock(&orangefs_request_mutex); + mutex_unlock(&orangefs_request_mutex); /* free the orangefs superblock private data */ kfree(ORANGEFS_SB(sb)); diff --git a/fs/orangefs/upcall.h b/fs/orangefs/upcall.h index 001b20239407..af0b0e36d559 100644 --- a/fs/orangefs/upcall.h +++ b/fs/orangefs/upcall.h @@ -98,7 +98,7 @@ struct orangefs_truncate_request_s { __s64 size; }; -struct orangefs_mmap_ra_cache_flush_request_s { +struct orangefs_ra_cache_flush_request_s { struct orangefs_object_kref refn; }; @@ -179,12 +179,18 @@ enum orangefs_param_request_op { ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT = 23, ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE = 24, ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES = 25, + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE = 26, + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT = 27, + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE = 28, }; struct orangefs_param_request_s { enum orangefs_param_request_type type; enum orangefs_param_request_op op; - __s64 value; + union { + __s64 value64; + __s32 value32[2]; + } u; char s_value[ORANGEFS_MAX_DEBUG_STRING_LEN]; }; @@ -204,6 +210,11 @@ struct orangefs_fs_key_request_s { __s32 __pad1; }; +/* 2.9.6 */ +struct orangefs_features_request_s { + __u64 features; +}; + struct orangefs_upcall_s { __s32 type; __u32 uid; @@ -228,7 +239,7 @@ struct orangefs_upcall_s { struct orangefs_rename_request_s rename; struct orangefs_statfs_request_s statfs; struct orangefs_truncate_request_s truncate; - struct orangefs_mmap_ra_cache_flush_request_s ra_cache_flush; + struct orangefs_ra_cache_flush_request_s ra_cache_flush; struct orangefs_fs_mount_request_s fs_mount; struct orangefs_fs_umount_request_s fs_umount; struct orangefs_getxattr_request_s getxattr; @@ -240,6 +251,7 @@ struct orangefs_upcall_s { struct orangefs_param_request_s param; struct orangefs_perf_count_request_s perf_count; struct orangefs_fs_key_request_s fs_key; + struct orangefs_features_request_s features; } req; }; diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c index 31635bc303fe..abcfa3fa9992 100644 --- a/fs/orangefs/waitqueue.c +++ b/fs/orangefs/waitqueue.c @@ -87,9 +87,9 @@ retry_servicing: */ if (!(flags & ORANGEFS_OP_NO_MUTEX)) { if (flags & ORANGEFS_OP_INTERRUPTIBLE) - ret = mutex_lock_interruptible(&request_mutex); + ret = mutex_lock_interruptible(&orangefs_request_mutex); else - ret = mutex_lock_killable(&request_mutex); + ret = mutex_lock_killable(&orangefs_request_mutex); /* * check to see if we were interrupted while waiting for * mutex @@ -129,7 +129,7 @@ retry_servicing: spin_unlock(&orangefs_request_list_lock); if (!(flags & ORANGEFS_OP_NO_MUTEX)) - mutex_unlock(&request_mutex); + mutex_unlock(&orangefs_request_mutex); ret = wait_for_matching_downcall(op, timeout, flags & ORANGEFS_OP_INTERRUPTIBLE); @@ -272,9 +272,9 @@ static void } else if (op_state_in_progress(op)) { /* op must be removed from the in progress htable */ spin_unlock(&op->lock); - spin_lock(&htable_ops_in_progress_lock); + spin_lock(&orangefs_htable_ops_in_progress_lock); list_del_init(&op->list); - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); gossip_debug(GOSSIP_WAIT_DEBUG, "Interrupted: Removed op %p" " from htable_ops_in_progress\n", diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 43fdc2765aea..db37a0e02d32 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -105,6 +105,13 @@ retry: goto retry; } + error = security_inode_copy_up_xattr(name); + if (error < 0 && error != -EOPNOTSUPP) + break; + if (error == 1) { + error = 0; + continue; /* Discard */ + } error = vfs_setxattr(new, name, value, size, 0); if (error) break; @@ -248,6 +255,8 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, struct dentry *upper = NULL; umode_t mode = stat->mode; int err; + const struct cred *old_creds = NULL; + struct cred *new_creds = NULL; newdentry = ovl_lookup_temp(workdir, dentry); err = PTR_ERR(newdentry); @@ -260,10 +269,23 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (IS_ERR(upper)) goto out1; + err = security_inode_copy_up(dentry, &new_creds); + if (err < 0) + goto out2; + + if (new_creds) + old_creds = override_creds(new_creds); + /* Can't properly set mode on creation because of the umask */ stat->mode &= S_IFMT; err = ovl_create_real(wdir, newdentry, stat, link, NULL, true); stat->mode = mode; + + if (new_creds) { + revert_creds(old_creds); + put_cred(new_creds); + } + if (err) goto out2; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 1560fdc09a5f..b0ffa1d1677e 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -489,6 +489,15 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, if (override_cred) { override_cred->fsuid = inode->i_uid; override_cred->fsgid = inode->i_gid; + if (!hardlink) { + err = security_dentry_create_files_as(dentry, + stat->mode, &dentry->d_name, old_cred, + override_cred); + if (err) { + put_cred(override_cred); + goto out_revert_creds; + } + } put_cred(override_creds(override_cred)); put_cred(override_cred); @@ -499,6 +508,7 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, err = ovl_create_over_whiteout(dentry, inode, stat, link, hardlink); } +out_revert_creds: revert_creds(old_cred); if (!err) { struct inode *realinode = d_inode(ovl_dentry_upper(dentry)); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a4585f961bf9..e2a94a26767b 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -835,11 +835,11 @@ retry: goto out_dput; err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); - if (err && err != -ENODATA) + if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS); - if (err && err != -ENODATA) + if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; /* Clear any inherited mode bits */ diff --git a/fs/pnode.c b/fs/pnode.c index 99899705b105..234a9ac49958 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -259,7 +259,7 @@ static int propagate_one(struct mount *m) read_sequnlock_excl(&mount_lock); } hlist_add_head(&child->mnt_hash, list); - return 0; + return count_mounts(m->mnt_ns, child); } /* diff --git a/fs/pnode.h b/fs/pnode.h index 0fcdbe7ca648..550f5a8b4fcf 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -52,4 +52,5 @@ void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); +int count_mounts(struct mnt_namespace *ns, struct mount *mnt); #endif /* _LINUX_PNODE_H */ diff --git a/fs/proc/base.c b/fs/proc/base.c index ac0df4dde823..3b792ab3c0dc 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -483,7 +483,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, save_stack_trace_tsk(task, &trace); for (i = 0; i < trace.nr_entries; i++) { - seq_printf(m, "[<%pK>] %pS\n", + seq_printf(m, "[<%pK>] %pB\n", (void *)entries[i], (void *)entries[i]); } unlock_trace(task); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index c633476616e0..bca66d83a765 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -390,6 +390,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, atomic_set(&ent->count, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); + proc_set_user(ent, (*parent)->uid, (*parent)->gid); + out: return ent; } diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index a939f5ed7f89..5c89a07e3d7f 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -430,6 +430,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) static ssize_t read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { + char *buf = file->private_data; ssize_t acc = 0; size_t size, tsz; size_t elf_buflen; @@ -500,23 +501,20 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (clear_user(buffer, tsz)) return -EFAULT; } else if (is_vmalloc_or_module_addr((void *)start)) { - char * elf_buf; - - elf_buf = kzalloc(tsz, GFP_KERNEL); - if (!elf_buf) - return -ENOMEM; - vread(elf_buf, (char *)start, tsz); + vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ - if (copy_to_user(buffer, elf_buf, tsz)) { - kfree(elf_buf); + if (copy_to_user(buffer, buf, tsz)) return -EFAULT; - } - kfree(elf_buf); } else { if (kern_addr_valid(start)) { unsigned long n; - n = copy_to_user(buffer, (char *)start, tsz); + /* + * Using bounce buffer to bypass the + * hardened user copy kernel text checks. + */ + memcpy(buf, (char *) start, tsz); + n = copy_to_user(buffer, buf, tsz); /* * We cannot distinguish between fault on source * and fault on destination. When this happens @@ -549,6 +547,11 @@ static int open_kcore(struct inode *inode, struct file *filp) { if (!capable(CAP_SYS_RAWIO)) return -EPERM; + + filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!filp->private_data) + return -ENOMEM; + if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { @@ -559,10 +562,16 @@ static int open_kcore(struct inode *inode, struct file *filp) return 0; } +static int release_kcore(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} static const struct file_operations proc_kcore_operations = { .read = read_kcore, .open = open_kcore, + .release = release_kcore, .llseek = default_llseek, }; diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index c8bbc68cdb05..7ae6b1da7cab 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -21,6 +21,7 @@ #include <linux/bitops.h> #include <linux/mount.h> #include <linux/nsproxy.h> +#include <linux/uidgid.h> #include <net/net_namespace.h> #include <linux/seq_file.h> @@ -185,6 +186,8 @@ const struct file_operations proc_net_operations = { static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; + kuid_t uid; + kgid_t gid; int err; err = -ENOMEM; @@ -199,6 +202,16 @@ static __net_init int proc_net_ns_init(struct net *net) netd->parent = &proc_root; memcpy(netd->name, "net", 4); + uid = make_kuid(net->user_ns, 0); + if (!uid_valid(uid)) + uid = netd->uid; + + gid = make_kgid(net->user_ns, 0); + if (!gid_valid(gid)) + gid = netd->gid; + + proc_set_user(netd, uid, gid); + err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); if (!net_statd) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 1b93650dda2f..71025b9e2a4e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -72,7 +72,7 @@ static DEFINE_SPINLOCK(sysctl_lock); static void drop_sysctl_table(struct ctl_table_header *header); static int sysctl_follow_link(struct ctl_table_header **phead, - struct ctl_table **pentry, struct nsproxy *namespaces); + struct ctl_table **pentry); static int insert_links(struct ctl_table_header *head); static void put_links(struct ctl_table_header *header); @@ -319,11 +319,11 @@ static void sysctl_head_finish(struct ctl_table_header *head) } static struct ctl_table_set * -lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +lookup_header_set(struct ctl_table_root *root) { struct ctl_table_set *set = &root->default_set; if (root->lookup) - set = root->lookup(root, namespaces); + set = root->lookup(root); return set; } @@ -430,6 +430,7 @@ static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, i static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { + struct ctl_table_root *root = head->root; struct inode *inode; struct proc_inode *ei; @@ -457,6 +458,10 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (is_empty_dir(head)) make_empty_dir_inode(inode); } + + if (root->set_ownership) + root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); + out: return inode; } @@ -491,7 +496,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, goto out; if (S_ISLNK(p->mode)) { - ret = sysctl_follow_link(&h, &p, current->nsproxy); + ret = sysctl_follow_link(&h, &p); err = ERR_PTR(ret); if (ret) goto out; @@ -659,7 +664,7 @@ static bool proc_sys_link_fill_cache(struct file *file, if (S_ISLNK(table->mode)) { /* It is not an error if we can not follow the link ignore it */ - int err = sysctl_follow_link(&head, &table, current->nsproxy); + int err = sysctl_follow_link(&head, &table); if (err) goto out; } @@ -976,7 +981,7 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) } static int sysctl_follow_link(struct ctl_table_header **phead, - struct ctl_table **pentry, struct nsproxy *namespaces) + struct ctl_table **pentry) { struct ctl_table_header *head; struct ctl_table_root *root; @@ -988,7 +993,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead, ret = 0; spin_lock(&sysctl_lock); root = (*pentry)->data; - set = lookup_header_set(root, namespaces); + set = lookup_header_set(root); dir = xlate_dir(set, (*phead)->parent); if (IS_ERR(dir)) ret = PTR_ERR(dir); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 187d84ef9de9..f6fa99eca515 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -581,6 +581,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, mss->anonymous_thp += HPAGE_PMD_SIZE; else if (PageSwapBacked(page)) mss->shmem_thp += HPAGE_PMD_SIZE; + else if (is_zone_device_page(page)) + /* pass */; else VM_BUG_ON_PAGE(1, page); smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 35df08ee9c97..2d445425aad7 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -341,6 +341,7 @@ static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs) struct qc_state state; int ret; + memset(&state, 0, sizeof (struct qc_state)); ret = sb->s_qcop->get_state(sb, &state); if (ret < 0) return ret; @@ -365,17 +366,19 @@ static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs) fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; - if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) { + + /* Inodes may be allocated even if inactive; copy out if present */ + if (state.s_state[USRQUOTA].ino) { fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino; fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks; fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents; } - if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) { + if (state.s_state[GRPQUOTA].ino) { fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino; fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks; fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents; } - if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) { + if (state.s_state[PRJQUOTA].ino) { /* * Q_XGETQSTAT doesn't have room for both group and project * quotas. So, allow the project quota values to be copied out @@ -411,6 +414,7 @@ static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs) struct qc_state state; int ret; + memset(&state, 0, sizeof (struct qc_state)); ret = sb->s_qcop->get_state(sb, &state); if (ret < 0) return ret; @@ -435,17 +439,19 @@ static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs) fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; - if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) { + + /* Inodes may be allocated even if inactive; copy out if present */ + if (state.s_state[USRQUOTA].ino) { fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino; fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks; fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents; } - if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) { + if (state.s_state[GRPQUOTA].ino) { fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino; fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks; fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents; } - if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) { + if (state.s_state[PRJQUOTA].ino) { fqs->qs_pquota.qfs_ino = state.s_state[PRJQUOTA].ino; fqs->qs_pquota.qfs_nblks = state.s_state[PRJQUOTA].blocks; fqs->qs_pquota.qfs_nextents = state.s_state[PRJQUOTA].nextents; diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 183a212694bf..12af0490322f 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -27,9 +27,17 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/ramfs.h> +#include <linux/sched.h> #include "internal.h" +static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} + const struct file_operations ramfs_file_operations = { .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, @@ -38,6 +46,7 @@ const struct file_operations ramfs_file_operations = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, + .get_unmapped_area = ramfs_mmu_get_unmapped_area, }; const struct inode_operations ramfs_file_inode_operations = { diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 7a4a85a6821e..74d5ddd26296 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -190,7 +190,15 @@ static int remove_save_link_only(struct super_block *s, static int reiserfs_quota_on_mount(struct super_block *, int); #endif -/* look for uncompleted unlinks and truncates and complete them */ +/* + * Look for uncompleted unlinks and truncates and complete them + * + * Called with superblock write locked. If quotas are enabled, we have to + * release/retake lest we call dquot_quota_on_mount(), proceed to + * schedule_on_each_cpu() in invalidate_bdev() and deadlock waiting for the per + * cpu worklets to complete flush_async_commits() that in turn wait for the + * superblock write lock. + */ static int finish_unfinished(struct super_block *s) { INITIALIZE_PATH(path); @@ -237,7 +245,9 @@ static int finish_unfinished(struct super_block *s) quota_enabled[i] = 0; continue; } + reiserfs_write_unlock(s); ret = reiserfs_quota_on_mount(s, i); + reiserfs_write_lock(s); if (ret < 0) reiserfs_warning(s, "reiserfs-2500", "cannot turn on journaled " diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index dc1358b5ec95..ac2de0ed69ad 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -233,8 +233,8 @@ void sysfs_remove_group(struct kobject *kobj, kn = kernfs_find_and_get(parent, grp->name); if (!kn) { WARN(!kn, KERN_WARNING - "sysfs group %p not found for kobject '%s'\n", - grp, kobject_name(kobj)); + "sysfs group '%s' not found for kobject '%s'\n", + grp->name, kobject_name(kobj)); return; } } else { diff --git a/fs/udf/file.c b/fs/udf/file.c index 632570617327..e855bf8d74b4 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -94,7 +94,7 @@ static int udf_adinicb_write_begin(struct file *file, return -ENOMEM; *pagep = page; - if (!PageUptodate(page) && len != PAGE_SIZE) + if (!PageUptodate(page)) __udf_adinicb_readpage(page); return 0; } @@ -105,11 +105,25 @@ static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return 0; } +static int udf_adinicb_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + loff_t last_pos = pos + copied; + if (last_pos > inode->i_size) + i_size_write(inode, last_pos); + set_page_dirty(page); + unlock_page(page); + put_page(page); + return copied; +} + const struct address_space_operations udf_adinicb_aops = { .readpage = udf_adinicb_readpage, .writepage = udf_adinicb_writepage, .write_begin = udf_adinicb_write_begin, - .write_end = simple_write_end, + .write_end = udf_adinicb_write_end, .direct_IO = udf_adinicb_direct_IO, }; diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index fc593c869493..584e87e11cb6 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -52,6 +52,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_inode_fork.o \ xfs_inode_buf.o \ xfs_log_rlimit.o \ + xfs_ag_resv.o \ xfs_rmap.o \ xfs_rmap_btree.o \ xfs_sb.o \ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c new file mode 100644 index 000000000000..e3ae0f2b4294 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -0,0 +1,325 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_ag_resv.h" +#include "xfs_trans_space.h" +#include "xfs_rmap_btree.h" +#include "xfs_btree.h" + +/* + * Per-AG Block Reservations + * + * For some kinds of allocation group metadata structures, it is advantageous + * to reserve a small number of blocks in each AG so that future expansions of + * that data structure do not encounter ENOSPC because errors during a btree + * split cause the filesystem to go offline. + * + * Prior to the introduction of reflink, this wasn't an issue because the free + * space btrees maintain a reserve of space (the AGFL) to handle any expansion + * that may be necessary; and allocations of other metadata (inodes, BMBT, + * dir/attr) aren't restricted to a single AG. However, with reflink it is + * possible to allocate all the space in an AG, have subsequent reflink/CoW + * activity expand the refcount btree, and discover that there's no space left + * to handle that expansion. Since we can calculate the maximum size of the + * refcount btree, we can reserve space for it and avoid ENOSPC. + * + * Handling per-AG reservations consists of three changes to the allocator's + * behavior: First, because these reservations are always needed, we decrease + * the ag_max_usable counter to reflect the size of the AG after the reserved + * blocks are taken. Second, the reservations must be reflected in the + * fdblocks count to maintain proper accounting. Third, each AG must maintain + * its own reserved block counter so that we can calculate the amount of space + * that must remain free to maintain the reservations. Fourth, the "remaining + * reserved blocks" count must be used when calculating the length of the + * longest free extent in an AG and to clamp maxlen in the per-AG allocation + * functions. In other words, we maintain a virtual allocation via in-core + * accounting tricks so that we don't have to clean up after a crash. :) + * + * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type + * values via struct xfs_alloc_arg or directly to the xfs_free_extent + * function. It might seem a little funny to maintain a reservoir of blocks + * to feed another reservoir, but the AGFL only holds enough blocks to get + * through the next transaction. The per-AG reservation is to ensure (we + * hope) that each AG never runs out of blocks. Each data structure wanting + * to use the reservation system should update ask/used in xfs_ag_resv_init. + */ + +/* + * Are we critically low on blocks? For now we'll define that as the number + * of blocks we can get our hands on being less than 10% of what we reserved + * or less than some arbitrary number (maximum btree height). + */ +bool +xfs_ag_resv_critical( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + xfs_extlen_t avail; + xfs_extlen_t orig; + + switch (type) { + case XFS_AG_RESV_METADATA: + avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved; + orig = pag->pag_meta_resv.ar_asked; + break; + case XFS_AG_RESV_AGFL: + avail = pag->pagf_freeblks + pag->pagf_flcount - + pag->pag_meta_resv.ar_reserved; + orig = pag->pag_agfl_resv.ar_asked; + break; + default: + ASSERT(0); + return false; + } + + trace_xfs_ag_resv_critical(pag, type, avail); + + /* Critically low if less than 10% or max btree height remains. */ + return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS; +} + +/* + * How many blocks are reserved but not used, and therefore must not be + * allocated away? + */ +xfs_extlen_t +xfs_ag_resv_needed( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + xfs_extlen_t len; + + len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved; + switch (type) { + case XFS_AG_RESV_METADATA: + case XFS_AG_RESV_AGFL: + len -= xfs_perag_resv(pag, type)->ar_reserved; + break; + case XFS_AG_RESV_NONE: + /* empty */ + break; + default: + ASSERT(0); + } + + trace_xfs_ag_resv_needed(pag, type, len); + + return len; +} + +/* Clean out a reservation */ +static int +__xfs_ag_resv_free( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + struct xfs_ag_resv *resv; + xfs_extlen_t oldresv; + int error; + + trace_xfs_ag_resv_free(pag, type, 0); + + resv = xfs_perag_resv(pag, type); + pag->pag_mount->m_ag_max_usable += resv->ar_asked; + /* + * AGFL blocks are always considered "free", so whatever + * was reserved at mount time must be given back at umount. + */ + if (type == XFS_AG_RESV_AGFL) + oldresv = resv->ar_orig_reserved; + else + oldresv = resv->ar_reserved; + error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true); + resv->ar_reserved = 0; + resv->ar_asked = 0; + + if (error) + trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno, + error, _RET_IP_); + return error; +} + +/* Free a per-AG reservation. */ +int +xfs_ag_resv_free( + struct xfs_perag *pag) +{ + int error; + int err2; + + error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL); + err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA); + if (err2 && !error) + error = err2; + return error; +} + +static int +__xfs_ag_resv_init( + struct xfs_perag *pag, + enum xfs_ag_resv_type type, + xfs_extlen_t ask, + xfs_extlen_t used) +{ + struct xfs_mount *mp = pag->pag_mount; + struct xfs_ag_resv *resv; + int error; + + resv = xfs_perag_resv(pag, type); + if (used > ask) + ask = used; + resv->ar_asked = ask; + resv->ar_reserved = resv->ar_orig_reserved = ask - used; + mp->m_ag_max_usable -= ask; + + trace_xfs_ag_resv_init(pag, type, ask); + + error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true); + if (error) + trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, + error, _RET_IP_); + + return error; +} + +/* Create a per-AG block reservation. */ +int +xfs_ag_resv_init( + struct xfs_perag *pag) +{ + xfs_extlen_t ask; + xfs_extlen_t used; + int error = 0; + + /* Create the metadata reservation. */ + if (pag->pag_meta_resv.ar_asked == 0) { + ask = used = 0; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, + ask, used); + if (error) + goto out; + } + + /* Create the AGFL metadata reservation */ + if (pag->pag_agfl_resv.ar_asked == 0) { + ask = used = 0; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used); + if (error) + goto out; + } + +out: + return error; +} + +/* Allocate a block from the reservation. */ +void +xfs_ag_resv_alloc_extent( + struct xfs_perag *pag, + enum xfs_ag_resv_type type, + struct xfs_alloc_arg *args) +{ + struct xfs_ag_resv *resv; + xfs_extlen_t len; + uint field; + + trace_xfs_ag_resv_alloc_extent(pag, type, args->len); + + switch (type) { + case XFS_AG_RESV_METADATA: + case XFS_AG_RESV_AGFL: + resv = xfs_perag_resv(pag, type); + break; + default: + ASSERT(0); + /* fall through */ + case XFS_AG_RESV_NONE: + field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS : + XFS_TRANS_SB_FDBLOCKS; + xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len); + return; + } + + len = min_t(xfs_extlen_t, args->len, resv->ar_reserved); + resv->ar_reserved -= len; + if (type == XFS_AG_RESV_AGFL) + return; + /* Allocations of reserved blocks only need on-disk sb updates... */ + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len); + /* ...but non-reserved blocks need in-core and on-disk updates. */ + if (args->len > len) + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS, + -((int64_t)args->len - len)); +} + +/* Free a block to the reservation. */ +void +xfs_ag_resv_free_extent( + struct xfs_perag *pag, + enum xfs_ag_resv_type type, + struct xfs_trans *tp, + xfs_extlen_t len) +{ + xfs_extlen_t leftover; + struct xfs_ag_resv *resv; + + trace_xfs_ag_resv_free_extent(pag, type, len); + + switch (type) { + case XFS_AG_RESV_METADATA: + case XFS_AG_RESV_AGFL: + resv = xfs_perag_resv(pag, type); + break; + default: + ASSERT(0); + /* fall through */ + case XFS_AG_RESV_NONE: + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); + return; + } + + leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved); + resv->ar_reserved += leftover; + if (type == XFS_AG_RESV_AGFL) + return; + /* Freeing into the reserved pool only requires on-disk update... */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len); + /* ...but freeing beyond that requires in-core and on-disk update. */ + if (len > leftover) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover); +} diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h new file mode 100644 index 000000000000..8d6c687deef3 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_AG_RESV_H__ +#define __XFS_AG_RESV_H__ + +int xfs_ag_resv_free(struct xfs_perag *pag); +int xfs_ag_resv_init(struct xfs_perag *pag); + +bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type); +xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag, + enum xfs_ag_resv_type type); + +void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, + struct xfs_alloc_arg *args); +void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, + struct xfs_trans *tp, xfs_extlen_t len); + +#endif /* __XFS_AG_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 05b5243d89f6..ca75dc90ebe0 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -37,6 +37,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_ag_resv.h" struct workqueue_struct *xfs_alloc_wq; @@ -74,14 +75,8 @@ xfs_prealloc_blocks( * extents need to be actually allocated. To get around this, we explicitly set * aside a few blocks which will not be reserved in delayed allocation. * - * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist - * and 4 more to handle a potential split of the file's bmap btree. - * - * When rmap is enabled, we must also be able to handle two rmap btree inserts - * to record both the file data extent and a new bmbt block. The bmbt block - * might not be in the same AG as the file data extent. In the worst case - * the bmap btree splits multiple levels and all the new blocks come from - * different AGs, so set aside enough to handle rmap btree splits in all AGs. + * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a + * potential split of the file's bmap btree. */ unsigned int xfs_alloc_set_aside( @@ -90,8 +85,6 @@ xfs_alloc_set_aside( unsigned int blocks; blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels; return blocks; } @@ -265,7 +258,7 @@ xfs_alloc_compute_diff( xfs_agblock_t wantbno, /* target starting block */ xfs_extlen_t wantlen, /* target length */ xfs_extlen_t alignment, /* target alignment */ - char userdata, /* are we allocating data? */ + int datatype, /* are we allocating data? */ xfs_agblock_t freebno, /* freespace's starting block */ xfs_extlen_t freelen, /* freespace's length */ xfs_agblock_t *newbnop) /* result: best start block from free */ @@ -276,6 +269,7 @@ xfs_alloc_compute_diff( xfs_extlen_t newlen1=0; /* length with newbno1 */ xfs_extlen_t newlen2=0; /* length with newbno2 */ xfs_agblock_t wantend; /* end of target extent */ + bool userdata = xfs_alloc_is_userdata(datatype); ASSERT(freelen >= wantlen); freeend = freebno + freelen; @@ -680,12 +674,29 @@ xfs_alloc_ag_vextent( xfs_alloc_arg_t *args) /* argument structure for allocation */ { int error=0; + xfs_extlen_t reservation; + xfs_extlen_t oldmax; ASSERT(args->minlen > 0); ASSERT(args->maxlen > 0); ASSERT(args->minlen <= args->maxlen); ASSERT(args->mod < args->prod); ASSERT(args->alignment > 0); + + /* + * Clamp maxlen to the amount of free space minus any reservations + * that have been made. + */ + oldmax = args->maxlen; + reservation = xfs_ag_resv_needed(args->pag, args->resv); + if (args->maxlen > args->pag->pagf_freeblks - reservation) + args->maxlen = args->pag->pagf_freeblks - reservation; + if (args->maxlen == 0) { + args->agbno = NULLAGBLOCK; + args->maxlen = oldmax; + return 0; + } + /* * Branch to correct routine based on the type. */ @@ -705,12 +716,14 @@ xfs_alloc_ag_vextent( /* NOTREACHED */ } + args->maxlen = oldmax; + if (error || args->agbno == NULLAGBLOCK) return error; ASSERT(args->len >= args->minlen); ASSERT(args->len <= args->maxlen); - ASSERT(!args->wasfromfl || !args->isfl); + ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL); ASSERT(args->agbno % args->alignment == 0); /* if not file data, insert new block into the reverse map btree */ @@ -732,12 +745,7 @@ xfs_alloc_ag_vextent( args->agbno, args->len)); } - if (!args->isfl) { - xfs_trans_mod_sb(args->tp, args->wasdel ? - XFS_TRANS_SB_RES_FDBLOCKS : - XFS_TRANS_SB_FDBLOCKS, - -((long)(args->len))); - } + xfs_ag_resv_alloc_extent(args->pag, args->resv, args); XFS_STATS_INC(args->mp, xs_allocx); XFS_STATS_ADD(args->mp, xs_allocb, args->len); @@ -917,7 +925,7 @@ xfs_alloc_find_best_extent( sdiff = xfs_alloc_compute_diff(args->agbno, args->len, args->alignment, - args->userdata, *sbnoa, + args->datatype, *sbnoa, *slena, &new); /* @@ -1101,7 +1109,7 @@ restart: if (args->len < blen) continue; ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->userdata, ltbnoa, + args->alignment, args->datatype, ltbnoa, ltlena, <new); if (ltnew != NULLAGBLOCK && (args->len > blen || ltdiff < bdiff)) { @@ -1254,7 +1262,7 @@ restart: args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->userdata, ltbnoa, + args->alignment, args->datatype, ltbnoa, ltlena, <new); error = xfs_alloc_find_best_extent(args, @@ -1271,7 +1279,7 @@ restart: args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); xfs_alloc_fix_len(args); gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->userdata, gtbnoa, + args->alignment, args->datatype, gtbnoa, gtlena, >new); error = xfs_alloc_find_best_extent(args, @@ -1331,7 +1339,7 @@ restart: } rlen = args->len; (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, - args->userdata, ltbnoa, ltlena, <new); + args->datatype, ltbnoa, ltlena, <new); ASSERT(ltnew >= ltbno); ASSERT(ltnew + rlen <= ltbnoa + ltlena); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); @@ -1583,6 +1591,7 @@ xfs_alloc_ag_vextent_small( int *stat) /* status: 0-freelist, 1-normal/none */ { struct xfs_owner_info oinfo; + struct xfs_perag *pag; int error; xfs_agblock_t fbno; xfs_extlen_t flen; @@ -1600,7 +1609,8 @@ xfs_alloc_ag_vextent_small( * to respect minleft even when pulling from the * freelist. */ - else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && + else if (args->minlen == 1 && args->alignment == 1 && + args->resv != XFS_AG_RESV_AGFL && (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) > args->minleft)) { error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); @@ -1608,9 +1618,9 @@ xfs_alloc_ag_vextent_small( goto error0; if (fbno != NULLAGBLOCK) { xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, - args->userdata); + xfs_alloc_allow_busy_reuse(args->datatype)); - if (args->userdata) { + if (xfs_alloc_is_userdata(args->datatype)) { xfs_buf_t *bp; bp = xfs_btree_get_bufs(args->mp, args->tp, @@ -1629,13 +1639,18 @@ xfs_alloc_ag_vextent_small( /* * If we're feeding an AGFL block to something that * doesn't live in the free space, we need to clear - * out the OWN_AG rmap. + * out the OWN_AG rmap and add the block back to + * the AGFL per-AG reservation. */ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1, &oinfo); if (error) goto error0; + pag = xfs_perag_get(args->mp, args->agno); + xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL, + args->tp, 1); + xfs_perag_put(pag); *stat = 0; return 0; @@ -1683,7 +1698,7 @@ xfs_free_ag_extent( xfs_agblock_t bno, xfs_extlen_t len, struct xfs_owner_info *oinfo, - int isfl) + enum xfs_ag_resv_type type) { xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ @@ -1911,21 +1926,22 @@ xfs_free_ag_extent( */ pag = xfs_perag_get(mp, agno); error = xfs_alloc_update_counters(tp, pag, agbp, len); + xfs_ag_resv_free_extent(pag, type, tp, len); xfs_perag_put(pag); if (error) goto error0; - if (!isfl) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); XFS_STATS_INC(mp, xs_freex); XFS_STATS_ADD(mp, xs_freeb, len); - trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); + trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL, + haveleft, haveright); return 0; error0: - trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1); + trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL, + -1, -1); if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); if (cnt_cur) @@ -1950,21 +1966,43 @@ xfs_alloc_compute_maxlevels( } /* - * Find the length of the longest extent in an AG. + * Find the length of the longest extent in an AG. The 'need' parameter + * specifies how much space we're going to need for the AGFL and the + * 'reserved' parameter tells us how many blocks in this AG are reserved for + * other callers. */ xfs_extlen_t xfs_alloc_longest_free_extent( struct xfs_mount *mp, struct xfs_perag *pag, - xfs_extlen_t need) + xfs_extlen_t need, + xfs_extlen_t reserved) { xfs_extlen_t delta = 0; + /* + * If the AGFL needs a recharge, we'll have to subtract that from the + * longest extent. + */ if (need > pag->pagf_flcount) delta = need - pag->pagf_flcount; + /* + * If we cannot maintain others' reservations with space from the + * not-longest freesp extents, we'll have to subtract /that/ from + * the longest extent too. + */ + if (pag->pagf_freeblks - pag->pagf_longest < reserved) + delta += reserved - (pag->pagf_freeblks - pag->pagf_longest); + + /* + * If the longest extent is long enough to satisfy all the + * reservations and AGFL rules in place, we can return this extent. + */ if (pag->pagf_longest > delta) return pag->pagf_longest - delta; + + /* Otherwise, let the caller try for 1 block if there's space. */ return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } @@ -2004,20 +2042,24 @@ xfs_alloc_space_available( { struct xfs_perag *pag = args->pag; xfs_extlen_t longest; + xfs_extlen_t reservation; /* blocks that are still reserved */ int available; if (flags & XFS_ALLOC_FLAG_FREEING) return true; + reservation = xfs_ag_resv_needed(pag, args->resv); + /* do we have enough contiguous free space for the allocation? */ - longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free); + longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free, + reservation); if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) return false; - /* do have enough free space remaining for the allocation? */ + /* do we have enough free space remaining for the allocation? */ available = (int)(pag->pagf_freeblks + pag->pagf_flcount - - min_free - args->total); - if (available < (int)args->minleft) + reservation - min_free - args->total); + if (available < (int)args->minleft || available <= 0) return false; return true; @@ -2058,7 +2100,7 @@ xfs_alloc_fix_freelist( * somewhere else if we are not being asked to try harder at this * point */ - if (pag->pagf_metadata && args->userdata && + if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) && (flags & XFS_ALLOC_FLAG_TRYLOCK)) { ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); goto out_agbp_relse; @@ -2124,7 +2166,7 @@ xfs_alloc_fix_freelist( if (error) goto out_agbp_relse; error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, - &targs.oinfo, 1); + &targs.oinfo, XFS_AG_RESV_AGFL); if (error) goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); @@ -2135,7 +2177,7 @@ xfs_alloc_fix_freelist( targs.mp = mp; targs.agbp = agbp; targs.agno = args->agno; - targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; + targs.alignment = targs.minlen = targs.prod = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); @@ -2146,6 +2188,7 @@ xfs_alloc_fix_freelist( while (pag->pagf_flcount < need) { targs.agbno = 0; targs.maxlen = need - pag->pagf_flcount; + targs.resv = XFS_AG_RESV_AGFL; /* Allocate as many blocks as possible at once. */ error = xfs_alloc_ag_vextent(&targs); @@ -2633,7 +2676,7 @@ xfs_alloc_vextent( * Try near allocation first, then anywhere-in-ag after * the first a.g. fails. */ - if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) && + if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && (mp->m_flags & XFS_MOUNT_32BITINODES)) { args->fsbno = XFS_AGB_TO_FSB(mp, ((mp->m_agfrotor / rotorstep) % @@ -2766,7 +2809,7 @@ xfs_alloc_vextent( #endif /* Zero the extent if we were asked to do so */ - if (args->userdata & XFS_ALLOC_USERDATA_ZERO) { + if (args->datatype & XFS_ALLOC_USERDATA_ZERO) { error = xfs_zero_extent(args->ip, args->fsbno, args->len); if (error) goto error0; @@ -2825,7 +2868,8 @@ xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ - struct xfs_owner_info *oinfo) /* extent owner */ + struct xfs_owner_info *oinfo, /* extent owner */ + enum xfs_ag_resv_type type) /* block reservation type */ { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *agbp; @@ -2834,6 +2878,7 @@ xfs_free_extent( int error; ASSERT(len != 0); + ASSERT(type != XFS_AG_RESV_AGFL); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT, @@ -2851,7 +2896,7 @@ xfs_free_extent( agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length), err); - error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0); + error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); if (error) goto err; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6fe2d6b7cfe9..7c404a6b0ae3 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -85,20 +85,33 @@ typedef struct xfs_alloc_arg { xfs_extlen_t len; /* output: actual size of extent */ xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ xfs_alloctype_t otype; /* original allocation type */ + int datatype; /* mask defining data type treatment */ char wasdel; /* set if allocation was prev delayed */ char wasfromfl; /* set if allocation is from freelist */ - char isfl; /* set if is freelist blocks - !acctg */ - char userdata; /* mask defining userdata treatment */ xfs_fsblock_t firstblock; /* io first block allocated */ struct xfs_owner_info oinfo; /* owner of blocks being allocated */ + enum xfs_ag_resv_type resv; /* block reservation to use */ } xfs_alloc_arg_t; /* - * Defines for userdata + * Defines for datatype */ #define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ #define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ #define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ +#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ + +static inline bool +xfs_alloc_is_userdata(int datatype) +{ + return (datatype & ~XFS_ALLOC_NOBUSY) != 0; +} + +static inline bool +xfs_alloc_allow_busy_reuse(int datatype) +{ + return (datatype & XFS_ALLOC_NOBUSY) == 0; +} /* freespace limit calculations */ #define XFS_ALLOC_AGFL_RESERVE 4 @@ -106,7 +119,8 @@ unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, - struct xfs_perag *pag, xfs_extlen_t need); + struct xfs_perag *pag, xfs_extlen_t need, + xfs_extlen_t reserved); unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); @@ -184,7 +198,8 @@ xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ - struct xfs_owner_info *oinfo);/* extent owner */ + struct xfs_owner_info *oinfo, /* extent owner */ + enum xfs_ag_resv_type type); /* block reservation type */ int /* error */ xfs_alloc_lookup_ge( diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index b060bca93402..9d7f61d36645 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -47,6 +47,7 @@ #include "xfs_attr_leaf.h" #include "xfs_filestream.h" #include "xfs_rmap.h" +#include "xfs_ag_resv.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -1388,7 +1389,7 @@ xfs_bmap_search_multi_extents( * Else, *lastxp will be set to the index of the found * entry; *gotp will contain the entry. */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmbt_rec_host_t * /* pointer to found extent entry */ xfs_bmap_search_extents( xfs_inode_t *ip, /* incore inode pointer */ xfs_fileoff_t bno, /* block number searched for */ @@ -3347,7 +3348,8 @@ xfs_bmap_adjacent( mp = ap->ip->i_mount; nullfb = *ap->firstblock == NULLFSBLOCK; - rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; + rt = XFS_IS_REALTIME_INODE(ap->ip) && + xfs_alloc_is_userdata(ap->datatype); fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); /* * If allocating at eof, and there's a previous real block, @@ -3501,7 +3503,8 @@ xfs_bmap_longest_free_extent( } longest = xfs_alloc_longest_free_extent(mp, pag, - xfs_alloc_min_freelist(mp, pag)); + xfs_alloc_min_freelist(mp, pag), + xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (*blen < longest) *blen = longest; @@ -3622,7 +3625,7 @@ xfs_bmap_btalloc( { xfs_mount_t *mp; /* mount point structure */ xfs_alloctype_t atype = 0; /* type for allocation routines */ - xfs_extlen_t align; /* minimum allocation alignment */ + xfs_extlen_t align = 0; /* minimum allocation alignment */ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ xfs_agnumber_t ag; xfs_alloc_arg_t args; @@ -3645,7 +3648,8 @@ xfs_bmap_btalloc( else if (mp->m_dalign) stripe_align = mp->m_dalign; - align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; + if (xfs_alloc_is_userdata(ap->datatype)) + align = xfs_get_extsz_hint(ap->ip); if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, @@ -3658,7 +3662,8 @@ xfs_bmap_btalloc( nullfb = *ap->firstblock == NULLFSBLOCK; fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); if (nullfb) { - if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { + if (xfs_alloc_is_userdata(ap->datatype) && + xfs_inode_is_filestream(ap->ip)) { ag = xfs_filestream_lookup_ag(ap->ip); ag = (ag != NULLAGNUMBER) ? ag : 0; ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); @@ -3698,7 +3703,8 @@ xfs_bmap_btalloc( * enough for the request. If one isn't found, then adjust * the minimum allocation size to the largest space found. */ - if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + if (xfs_alloc_is_userdata(ap->datatype) && + xfs_inode_is_filestream(ap->ip)) error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); else error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); @@ -3781,9 +3787,9 @@ xfs_bmap_btalloc( } args.minleft = ap->minleft; args.wasdel = ap->wasdel; - args.isfl = 0; - args.userdata = ap->userdata; - if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) + args.resv = XFS_AG_RESV_NONE; + args.datatype = ap->datatype; + if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) args.ip = ap->ip; error = xfs_alloc_vextent(&args); @@ -3877,7 +3883,8 @@ STATIC int xfs_bmap_alloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { - if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) + if (XFS_IS_REALTIME_INODE(ap->ip) && + xfs_alloc_is_userdata(ap->datatype)) return xfs_bmap_rtalloc(ap); return xfs_bmap_btalloc(ap); } @@ -4074,7 +4081,7 @@ xfs_bmapi_read( return 0; } -STATIC int +int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, xfs_fileoff_t aoff, @@ -4170,91 +4177,6 @@ out_unreserve_quota: return error; } -/* - * Map file blocks to filesystem blocks, adding delayed allocations as needed. - */ -int -xfs_bmapi_delay( - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - struct xfs_bmbt_irec *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - int flags) /* XFS_BMAPI_... */ -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - struct xfs_bmbt_irec got; /* current file extent record */ - struct xfs_bmbt_irec prev; /* previous file extent record */ - xfs_fileoff_t obno; /* old block number (offset) */ - xfs_fileoff_t end; /* end of mapped file region */ - xfs_extnum_t lastx; /* last useful extent number */ - int eof; /* we've hit the end of extents */ - int n = 0; /* current extent index */ - int error = 0; - - ASSERT(*nmap >= 1); - ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); - ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; - } - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - XFS_STATS_INC(mp, xs_blk_mapw); - - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); - if (error) - return error; - } - - xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev); - end = bno + len; - obno = bno; - - while (bno < end && n < *nmap) { - if (eof || got.br_startoff > bno) { - error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, - &prev, &lastx, eof); - if (error) { - if (n == 0) { - *nmap = 0; - return error; - } - break; - } - } - - /* set up the extent map to return. */ - xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); - xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); - - /* If we're done, stop now. */ - if (bno >= end || n >= *nmap) - break; - - /* Else go on to the next record. */ - prev = got; - if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); - else - eof = 1; - } - - *nmap = n; - return 0; -} - - static int xfs_bmapi_allocate( struct xfs_bmalloca *bma) @@ -4287,15 +4209,21 @@ xfs_bmapi_allocate( } /* - * Indicate if this is the first user data in the file, or just any - * user data. And if it is userdata, indicate whether it needs to - * be initialised to zero during allocation. + * Set the data type being allocated. For the data fork, the first data + * in the file is treated differently to all other allocations. For the + * attribute fork, we only need to ensure the allocated range is not on + * the busy list. */ if (!(bma->flags & XFS_BMAPI_METADATA)) { - bma->userdata = (bma->offset == 0) ? - XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; + bma->datatype = XFS_ALLOC_NOBUSY; + if (whichfork == XFS_DATA_FORK) { + if (bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + else + bma->datatype |= XFS_ALLOC_USERDATA; + } if (bma->flags & XFS_BMAPI_ZERO) - bma->userdata |= XFS_ALLOC_USERDATA_ZERO; + bma->datatype |= XFS_ALLOC_USERDATA_ZERO; } bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; @@ -4565,7 +4493,7 @@ xfs_bmapi_write( bma.tp = tp; bma.ip = ip; bma.total = total; - bma.userdata = 0; + bma.datatype = 0; bma.dfops = dfops; bma.firstblock = firstblock; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 254034f96941..8395f6e8cf7d 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -54,7 +54,7 @@ struct xfs_bmalloca { bool wasdel; /* replacing a delayed allocation */ bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ - char userdata;/* userdata mask */ + int datatype;/* data type being allocated */ int flags; }; @@ -181,9 +181,6 @@ int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); -int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno, - xfs_filblks_t len, struct xfs_bmbt_irec *mval, - int *nmap, int flags); int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_fsblock_t *firstblock, xfs_extlen_t total, @@ -202,5 +199,12 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_defer_ops *dfops, enum shift_direction direction, int num_exts); int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); +struct xfs_bmbt_rec_host * + xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno, + int fork, int *eofp, xfs_extnum_t *lastxp, + struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp); +int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, xfs_fileoff_t aoff, + xfs_filblks_t len, struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 08569792fe20..aa1752f918b8 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2070,7 +2070,7 @@ __xfs_btree_updkeys( struct xfs_buf *bp0, bool force_all) { - union xfs_btree_bigkey key; /* keys from current level */ + union xfs_btree_key key; /* keys from current level */ union xfs_btree_key *lkey; /* keys from the next level up */ union xfs_btree_key *hkey; union xfs_btree_key *nlkey; /* keys from the next level up */ @@ -2086,7 +2086,7 @@ __xfs_btree_updkeys( trace_xfs_btree_updkeys(cur, level, bp0); - lkey = (union xfs_btree_key *)&key; + lkey = &key; hkey = xfs_btree_high_key_from_key(cur, lkey); xfs_btree_get_keys(cur, block, lkey); for (level++; level < cur->bc_nlevels; level++) { @@ -3226,7 +3226,7 @@ xfs_btree_insrec( struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ struct xfs_btree_cur *ncur; /* new btree cursor */ - union xfs_btree_bigkey nkey; /* new block key */ + union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ int ptr; /* key/record index */ @@ -3241,7 +3241,7 @@ xfs_btree_insrec( XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec); ncur = NULL; - lkey = (union xfs_btree_key *)&nkey; + lkey = &nkey; /* * If we have an external root pointer, and we've made it to the @@ -3444,14 +3444,14 @@ xfs_btree_insert( union xfs_btree_ptr nptr; /* new block number (split result) */ struct xfs_btree_cur *ncur; /* new cursor (split result) */ struct xfs_btree_cur *pcur; /* previous level's cursor */ - union xfs_btree_bigkey bkey; /* key of block to insert */ + union xfs_btree_key bkey; /* key of block to insert */ union xfs_btree_key *key; union xfs_btree_rec rec; /* record to insert */ level = 0; ncur = NULL; pcur = cur; - key = (union xfs_btree_key *)&bkey; + key = &bkey; xfs_btree_set_ptr_null(cur, &nptr); @@ -4797,3 +4797,50 @@ xfs_btree_query_range( return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, fn, priv); } + +/* + * Calculate the number of blocks needed to store a given number of records + * in a short-format (per-AG metadata) btree. + */ +xfs_extlen_t +xfs_btree_calc_size( + struct xfs_mount *mp, + uint *limits, + unsigned long long len) +{ + int level; + int maxrecs; + xfs_extlen_t rval; + + maxrecs = limits[0]; + for (level = 0, rval = 0; len > 1; level++) { + len += maxrecs - 1; + do_div(len, maxrecs); + maxrecs = limits[1]; + rval += len; + } + return rval; +} + +int +xfs_btree_count_blocks_helper( + struct xfs_btree_cur *cur, + int level, + void *data) +{ + xfs_extlen_t *blocks = data; + (*blocks)++; + + return 0; +} + +/* Count the blocks in a btree and return the result in *blocks. */ +int +xfs_btree_count_blocks( + struct xfs_btree_cur *cur, + xfs_extlen_t *blocks) +{ + *blocks = 0; + return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, + blocks); +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 04d0865e5e6d..3f8556a5c2ad 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -37,30 +37,18 @@ union xfs_btree_ptr { __be64 l; /* long form ptr */ }; -union xfs_btree_key { - struct xfs_bmbt_key bmbt; - xfs_bmdr_key_t bmbr; /* bmbt root block */ - xfs_alloc_key_t alloc; - struct xfs_inobt_key inobt; - struct xfs_rmap_key rmap; -}; - /* - * In-core key that holds both low and high keys for overlapped btrees. - * The two keys are packed next to each other on disk, so do the same - * in memory. Preserve the existing xfs_btree_key as a single key to - * avoid the mental model breakage that would happen if we passed a - * bigkey into a function that operates on a single key. + * The in-core btree key. Overlapping btrees actually store two keys + * per pointer, so we reserve enough memory to hold both. The __*bigkey + * items should never be accessed directly. */ -union xfs_btree_bigkey { +union xfs_btree_key { struct xfs_bmbt_key bmbt; xfs_bmdr_key_t bmbr; /* bmbt root block */ xfs_alloc_key_t alloc; struct xfs_inobt_key inobt; - struct { - struct xfs_rmap_key rmap; - struct xfs_rmap_key rmap_hi; - }; + struct xfs_rmap_key rmap; + struct xfs_rmap_key __rmap_bigkey[2]; }; union xfs_btree_rec { @@ -513,6 +501,8 @@ bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, unsigned long len); +xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits, + unsigned long long len); /* return codes */ #define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ @@ -529,4 +519,6 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, void *data); +int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index c221d0ecd52e..613c5cf19436 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -81,6 +81,10 @@ * - For each work item attached to the log intent item, * * Perform the described action. * * Attach the work item to the log done item. + * * If the result of doing the work was -EAGAIN, ->finish work + * wants a new transaction. See the "Requesting a Fresh + * Transaction while Finishing Deferred Work" section below for + * details. * * The key here is that we must log an intent item for all pending * work items every time we roll the transaction, and that we must log @@ -88,6 +92,34 @@ * we can perform complex remapping operations, chaining intent items * as needed. * + * Requesting a Fresh Transaction while Finishing Deferred Work + * + * If ->finish_item decides that it needs a fresh transaction to + * finish the work, it must ask its caller (xfs_defer_finish) for a + * continuation. The most likely cause of this circumstance are the + * refcount adjust functions deciding that they've logged enough items + * to be at risk of exceeding the transaction reservation. + * + * To get a fresh transaction, we want to log the existing log done + * item to prevent the log intent item from replaying, immediately log + * a new log intent item with the unfinished work items, roll the + * transaction, and re-call ->finish_item wherever it left off. The + * log done item and the new log intent item must be in the same + * transaction or atomicity cannot be guaranteed; defer_finish ensures + * that this happens. + * + * This requires some coordination between ->finish_item and + * defer_finish. Upon deciding to request a new transaction, + * ->finish_item should update the current work item to reflect the + * unfinished work. Next, it should reset the log done item's list + * count to the number of items finished, and return -EAGAIN. + * defer_finish sees the -EAGAIN, logs the new log intent item + * with the remaining work items, and leaves the xfs_defer_pending + * item at the head of the dop_work queue. Then it rolls the + * transaction and picks up processing where it left off. It is + * required that ->finish_item must be careful to leave enough + * transaction reservation to fit the new log intent item. + * * This is an example of remapping the extent (E, E+B) into file X at * offset A and dealing with the extent (C, C+B) already being mapped * there: @@ -104,21 +136,26 @@ * | Intent to add rmap (X, E, A, B) | * +-------------------------------------------------+ * | Reduce refcount for extent (C, B) | t2 - * | Done reducing refcount for extent (C, B) | + * | Done reducing refcount for extent (C, 9) | + * | Intent to reduce refcount for extent (C+9, B-9) | + * | (ran out of space after 9 refcount updates) | + * +-------------------------------------------------+ + * | Reduce refcount for extent (C+9, B+9) | t3 + * | Done reducing refcount for extent (C+9, B-9) | * | Increase refcount for extent (E, B) | * | Done increasing refcount for extent (E, B) | * | Intent to free extent (C, B) | * | Intent to free extent (F, 1) (refcountbt block) | * | Intent to remove rmap (F, 1, REFC) | * +-------------------------------------------------+ - * | Remove rmap (X, C, A, B) | t3 + * | Remove rmap (X, C, A, B) | t4 * | Done removing rmap (X, C, A, B) | * | Add rmap (X, E, A, B) | * | Done adding rmap (X, E, A, B) | * | Remove rmap (F, 1, REFC) | * | Done removing rmap (F, 1, REFC) | * +-------------------------------------------------+ - * | Free extent (C, B) | t4 + * | Free extent (C, B) | t5 * | Done freeing extent (C, B) | * | Free extent (D, 1) | * | Done freeing extent (D, 1) | @@ -141,6 +178,9 @@ * - Intent to free extent (C, B) * - Intent to free extent (F, 1) (refcountbt block) * - Intent to remove rmap (F, 1, REFC) + * + * Note that the continuation requested between t2 and t3 is likely to + * reoccur. */ static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX]; @@ -323,7 +363,16 @@ xfs_defer_finish( dfp->dfp_count--; error = dfp->dfp_type->finish_item(*tp, dop, li, dfp->dfp_done, &state); - if (error) { + if (error == -EAGAIN) { + /* + * Caller wants a fresh transaction; + * put the work item back on the list + * and jump out. + */ + list_add(li, &dfp->dfp_work); + dfp->dfp_count++; + break; + } else if (error) { /* * Clean up after ourselves and jump out. * xfs_defer_cancel will take care of freeing @@ -335,9 +384,25 @@ xfs_defer_finish( goto out; } } - /* Done with the dfp, free it. */ - list_del(&dfp->dfp_list); - kmem_free(dfp); + if (error == -EAGAIN) { + /* + * Caller wants a fresh transaction, so log a + * new log intent item to replace the old one + * and roll the transaction. See "Requesting + * a Fresh Transaction while Finishing + * Deferred Work" above. + */ + dfp->dfp_intent = dfp->dfp_type->create_intent(*tp, + dfp->dfp_count); + dfp->dfp_done = NULL; + list_for_each(li, &dfp->dfp_work) + dfp->dfp_type->log_item(*tp, dfp->dfp_intent, + li); + } else { + /* Done with the dfp, free it. */ + list_del(&dfp->dfp_list); + kmem_free(dfp); + } if (cleanup_fn) cleanup_fn(*tp, state, error); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 31ca2208c03d..eab68ae2e011 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -132,7 +132,7 @@ xfs_inobt_free_block( xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); return xfs_free_extent(cur->bc_tp, XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, - &oinfo); + &oinfo, XFS_AG_RESV_NONE); } STATIC int diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a6eed43fa7cd..fc5eef85d61e 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -647,9 +647,17 @@ struct xfs_rui_log_format { __uint16_t rui_size; /* size of this item */ __uint32_t rui_nextents; /* # extents to free */ __uint64_t rui_id; /* rui identifier */ - struct xfs_map_extent rui_extents[1]; /* array of extents to rmap */ + struct xfs_map_extent rui_extents[]; /* array of extents to rmap */ }; +static inline size_t +xfs_rui_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_rui_log_format) + + nr * sizeof(struct xfs_map_extent); +} + /* * This is the structure used to lay out an rud log item in the * log. The rud_extents array is a variable size array whose diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 7575cfc3ad15..4a28fa91e3b1 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -200,7 +200,7 @@ xfs_setfilesize_trans_alloc( * Update on-disk file size now that data has been written to disk. */ STATIC int -xfs_setfilesize( +__xfs_setfilesize( struct xfs_inode *ip, struct xfs_trans *tp, xfs_off_t offset, @@ -225,6 +225,23 @@ xfs_setfilesize( return xfs_trans_commit(tp); } +int +xfs_setfilesize( + struct xfs_inode *ip, + xfs_off_t offset, + size_t size) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + if (error) + return error; + + return __xfs_setfilesize(ip, tp, offset, size); +} + STATIC int xfs_setfilesize_ioend( struct xfs_ioend *ioend, @@ -247,7 +264,7 @@ xfs_setfilesize_ioend( return error; } - return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); + return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } /* @@ -1336,13 +1353,12 @@ xfs_end_io_direct_write( { struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; uintptr_t flags = (uintptr_t)private; int error = 0; trace_xfs_end_io_direct_write(ip, offset, size); - if (XFS_FORCED_SHUTDOWN(mp)) + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; if (size <= 0) @@ -1380,14 +1396,9 @@ xfs_end_io_direct_write( error = xfs_iomap_write_unwritten(ip, offset, size); } else if (flags & XFS_DIO_FLAG_APPEND) { - struct xfs_trans *tp; - trace_xfs_end_io_direct_write_append(ip, offset, size); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, - &tp); - if (!error) - error = xfs_setfilesize(ip, tp, offset, size); + error = xfs_setfilesize(ip, offset, size); } return error; diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index bf2d9a141a73..1950e3bca2ac 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -62,6 +62,7 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, ssize_t size, void *private); +int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); extern void xfs_count_page_state(struct page *, int *, int *); extern struct block_device *xfs_find_bdev_for_inode(struct inode *); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 4ece4f2ffc72..e827d657c314 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -182,7 +182,7 @@ xfs_bmap_rtalloc( XFS_TRANS_DQ_RTBCOUNT, (long) ralen); /* Zero the extent if we were asked to do so */ - if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) { + if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) { error = xfs_zero_extent(ap->ip, ap->blkno, ap->length); if (error) return error; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index e455f9098d49..2975cb2319f4 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -865,7 +865,7 @@ xfs_buf_item_log_segment( */ if (bit) { end_bit = MIN(bit + bits_to_set, (uint)NBWORD); - mask = ((1 << (end_bit - bit)) - 1) << bit; + mask = ((1U << (end_bit - bit)) - 1) << bit; *wordp |= mask; wordp++; bits_set = end_bit - bit; @@ -888,7 +888,7 @@ xfs_buf_item_log_segment( */ end_bit = bits_to_set - bits_set; if (end_bit) { - mask = (1 << end_bit) - 1; + mask = (1U << end_bit) - 1; *wordp |= mask; } } @@ -1095,7 +1095,8 @@ xfs_buf_iodone_callback_error( bp->b_last_error != bp->b_error) { bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); bp->b_last_error = bp->b_error; - if (cfg->retry_timeout && !bp->b_first_retry_time) + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + !bp->b_first_retry_time) bp->b_first_retry_time = jiffies; xfs_buf_ioerror(bp, 0); @@ -1111,7 +1112,7 @@ xfs_buf_iodone_callback_error( if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && ++bp->b_retries > cfg->max_retries) goto permanent_error; - if (cfg->retry_timeout && + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) goto permanent_error; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index c263e079273e..162dc186cf04 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -384,7 +384,7 @@ restart: * If this is a metadata allocation, try to reuse the busy * extent instead of trimming the allocation. */ - if (!args->userdata && + if (!xfs_alloc_is_userdata(args->datatype) && !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { if (!xfs_extent_busy_update_extent(args->mp, args->pag, busyp, fbno, flen, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e612a0233710..c68517b0f248 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -269,6 +269,8 @@ xfs_file_dio_aio_read( return -EINVAL; } + file_accessed(iocb->ki_filp); + /* * Locking is a bit tricky here. If we take an exclusive lock for direct * IO, we effectively serialise all new concurrent read IO to this file @@ -323,7 +325,6 @@ xfs_file_dio_aio_read( } xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); - file_accessed(iocb->ki_filp); return ret; } @@ -332,10 +333,7 @@ xfs_file_dax_read( struct kiocb *iocb, struct iov_iter *to) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct xfs_inode *ip = XFS_I(inode); - struct iov_iter data = *to; + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); size_t count = iov_iter_count(to); ssize_t ret = 0; @@ -345,11 +343,7 @@ xfs_file_dax_read( return 0; /* skip atime */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); - if (ret > 0) { - iocb->ki_pos += ret; - iov_iter_advance(to, ret); - } + ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops); xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); file_accessed(iocb->ki_filp); @@ -711,70 +705,32 @@ xfs_file_dax_write( struct kiocb *iocb, struct iov_iter *from) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0; - int unaligned_io = 0; - int iolock; - struct iov_iter data; + int iolock = XFS_IOLOCK_EXCL; + ssize_t ret, error = 0; + size_t count; + loff_t pos; - /* "unaligned" here means not aligned to a filesystem block */ - if ((iocb->ki_pos & mp->m_blockmask) || - ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { - unaligned_io = 1; - iolock = XFS_IOLOCK_EXCL; - } else if (mapping->nrpages) { - iolock = XFS_IOLOCK_EXCL; - } else { - iolock = XFS_IOLOCK_SHARED; - } xfs_rw_ilock(ip, iolock); - ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; - /* - * Yes, even DAX files can have page cache attached to them: A zeroed - * page is inserted into the pagecache when we have to serve a write - * fault on a hole. It should never be dirtied and can simply be - * dropped from the pagecache once we get real data for the page. - * - * XXX: This is racy against mmap, and there's nothing we can do about - * it. dax_do_io() should really do this invalidation internally as - * it will know if we've allocated over a holei for this specific IO and - * if so it needs to update the mapping tree and invalidate existing - * PTEs over the newly allocated range. Remove this invalidation when - * dax_do_io() is fixed up. - */ - if (mapping->nrpages) { - loff_t end = iocb->ki_pos + iov_iter_count(from) - 1; + pos = iocb->ki_pos; + count = iov_iter_count(from); - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, - end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - } + trace_xfs_file_dax_write(ip, count, pos); - if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - iolock = XFS_IOLOCK_SHARED; + ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops); + if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { + i_size_write(inode, iocb->ki_pos); + error = xfs_setfilesize(ip, pos, ret); } - trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); - - data = *from; - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, - xfs_end_io_direct_write, 0); - if (ret > 0) { - iocb->ki_pos += ret; - iov_iter_advance(from, ret); - } out: xfs_rw_iunlock(ip, iolock); - return ret; + return error ? error : ret; } STATIC ssize_t @@ -1513,7 +1469,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); + ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); } else { ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); @@ -1547,7 +1503,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. */ - ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault); + ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 4a33a3304369..043ca3808ea2 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -30,6 +30,7 @@ #include "xfs_mru_cache.h" #include "xfs_filestream.h" #include "xfs_trace.h" +#include "xfs_ag_resv.h" struct xfs_fstrm_item { struct xfs_mru_cache_elem mru; @@ -198,7 +199,8 @@ xfs_filestream_pick_ag( } longest = xfs_alloc_longest_free_extent(mp, pag, - xfs_alloc_min_freelist(mp, pag)); + xfs_alloc_min_freelist(mp, pag), + xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (((minlen && longest >= minlen) || (!minlen && pag->pagf_freeblks >= minfree)) && (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || @@ -369,7 +371,8 @@ xfs_filestream_new_ag( struct xfs_mount *mp = ip->i_mount; xfs_extlen_t minlen = ap->length; xfs_agnumber_t startag = 0; - int flags, err = 0; + int flags = 0; + int err = 0; struct xfs_mru_cache_elem *mru; *agp = NULLAGNUMBER; @@ -385,8 +388,10 @@ xfs_filestream_new_ag( startag = (item->ag + 1) % mp->m_sb.sb_agcount; } - flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | - (ap->dfops->dop_low ? XFS_PICK_LOWSPACE : 0); + if (xfs_alloc_is_userdata(ap->datatype)) + flags |= XFS_PICK_USERDATA; + if (ap->dfops->dop_low) + flags |= XFS_PICK_LOWSPACE; err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 0b7f986745c1..94ac06f3d908 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -553,7 +553,7 @@ xfs_growfs_data_private( error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, be32_to_cpu(agf->agf_length) - new), - new, &oinfo); + new, &oinfo, XFS_AG_RESV_NONE); if (error) goto error0; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index fb39a66914dd..65b2e3f85f52 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1414,6 +1414,16 @@ xfs_inode_set_eofblocks_tag( struct xfs_perag *pag; int tagged; + /* + * Don't bother locking the AG and looking up in the radix trees + * if we already know that we have the tag set. + */ + if (ip->i_flags & XFS_IEOFBLOCKS) + return; + spin_lock(&ip->i_flags_lock); + ip->i_flags |= XFS_IEOFBLOCKS; + spin_unlock(&ip->i_flags_lock); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); trace_xfs_inode_set_eofblocks_tag(ip); @@ -1449,6 +1459,10 @@ xfs_inode_clear_eofblocks_tag( struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; + spin_lock(&ip->i_flags_lock); + ip->i_flags &= ~XFS_IEOFBLOCKS; + spin_unlock(&ip->i_flags_lock); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); trace_xfs_inode_clear_eofblocks_tag(ip); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e1a411e08f00..8f30d2533b48 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -216,6 +216,7 @@ xfs_get_initial_prid(struct xfs_inode *dp) #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ +#define XFS_IEOFBLOCKS (1 << 10)/* has the preallocblocks tag set */ /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2af0dda1c978..c08253e11545 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2016 Christoph Hellwig. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -42,17 +43,40 @@ #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) -#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP -STATIC int -xfs_iomap_eof_align_last_fsb( - xfs_mount_t *mp, - xfs_inode_t *ip, - xfs_extlen_t extsize, - xfs_fileoff_t *last_fsb) +void +xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = ip->i_mount; + + if (imap->br_startblock == HOLESTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + } else if (imap->br_startblock == DELAYSTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_DELALLOC; + } else { + iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); + if (imap->br_state == XFS_EXT_UNWRITTEN) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + } + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); + iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); +} + +static xfs_extlen_t +xfs_eof_alignment( + struct xfs_inode *ip, + xfs_extlen_t extsize) { - xfs_extlen_t align = 0; - int eof, error; + struct xfs_mount *mp = ip->i_mount; + xfs_extlen_t align = 0; if (!XFS_IS_REALTIME_INODE(ip)) { /* @@ -83,8 +107,21 @@ xfs_iomap_eof_align_last_fsb( align = extsize; } + return align; +} + +STATIC int +xfs_iomap_eof_align_last_fsb( + struct xfs_inode *ip, + xfs_extlen_t extsize, + xfs_fileoff_t *last_fsb) +{ + xfs_extlen_t align = xfs_eof_alignment(ip, extsize); + if (align) { xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align); + int eof, error; + error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); if (error) return error; @@ -154,7 +191,7 @@ xfs_iomap_write_direct( */ ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags & XFS_IFEXTENTS); - error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); + error = xfs_iomap_eof_align_last_fsb(ip, extsz, &last_fsb); if (error) goto out_unlock; } else { @@ -274,130 +311,6 @@ out_trans_cancel: goto out_unlock; } -/* - * If the caller is doing a write at the end of the file, then extend the - * allocation out to the file system's write iosize. We clean up any extra - * space left over when the file is closed in xfs_inactive(). - * - * If we find we already have delalloc preallocation beyond EOF, don't do more - * preallocation as it it not needed. - */ -STATIC int -xfs_iomap_eof_want_preallocate( - xfs_mount_t *mp, - xfs_inode_t *ip, - xfs_off_t offset, - size_t count, - xfs_bmbt_irec_t *imap, - int nimaps, - int *prealloc) -{ - xfs_fileoff_t start_fsb; - xfs_filblks_t count_fsb; - int n, error, imaps; - int found_delalloc = 0; - - *prealloc = 0; - if (offset + count <= XFS_ISIZE(ip)) - return 0; - - /* - * If the file is smaller than the minimum prealloc and we are using - * dynamic preallocation, don't do any preallocation at all as it is - * likely this is the only write to the file that is going to be done. - */ - if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && - XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)) - return 0; - - /* - * If there are any real blocks past eof, then don't - * do any speculative allocation. - */ - start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1))); - count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - while (count_fsb > 0) { - imaps = nimaps; - error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, - 0); - if (error) - return error; - for (n = 0; n < imaps; n++) { - if ((imap[n].br_startblock != HOLESTARTBLOCK) && - (imap[n].br_startblock != DELAYSTARTBLOCK)) - return 0; - start_fsb += imap[n].br_blockcount; - count_fsb -= imap[n].br_blockcount; - - if (imap[n].br_startblock == DELAYSTARTBLOCK) - found_delalloc = 1; - } - } - if (!found_delalloc) - *prealloc = 1; - return 0; -} - -/* - * Determine the initial size of the preallocation. We are beyond the current - * EOF here, but we need to take into account whether this is a sparse write or - * an extending write when determining the preallocation size. Hence we need to - * look up the extent that ends at the current write offset and use the result - * to determine the preallocation size. - * - * If the extent is a hole, then preallocation is essentially disabled. - * Otherwise we take the size of the preceeding data extent as the basis for the - * preallocation size. If the size of the extent is greater than half the - * maximum extent length, then use the current offset as the basis. This ensures - * that for large files the preallocation size always extends to MAXEXTLEN - * rather than falling short due to things like stripe unit/width alignment of - * real extents. - */ -STATIC xfs_fsblock_t -xfs_iomap_eof_prealloc_initial_size( - struct xfs_mount *mp, - struct xfs_inode *ip, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - int nimaps) -{ - xfs_fileoff_t start_fsb; - int imaps = 1; - int error; - - ASSERT(nimaps >= imaps); - - /* if we are using a specific prealloc size, return now */ - if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) - return 0; - - /* If the file is small, then use the minimum prealloc */ - if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign)) - return 0; - - /* - * As we write multiple pages, the offset will always align to the - * start of a page and hence point to a hole at EOF. i.e. if the size is - * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096) - * will return FSB 1. Hence if there are blocks in the file, we want to - * point to the block prior to the EOF block and not the hole that maps - * directly at @offset. - */ - start_fsb = XFS_B_TO_FSB(mp, offset); - if (start_fsb) - start_fsb--; - error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE); - if (error) - return 0; - - ASSERT(imaps == 1); - if (imap[0].br_startblock == HOLESTARTBLOCK) - return 0; - if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) - return imap[0].br_blockcount << 1; - return XFS_B_TO_FSB(mp, offset); -} - STATIC bool xfs_quota_need_throttle( struct xfs_inode *ip, @@ -459,27 +372,76 @@ xfs_quota_calc_throttle( } /* + * If we are doing a write at the end of the file and there are no allocations + * past this one, then extend the allocation out to the file system's write + * iosize. + * * If we don't have a user specified preallocation size, dynamically increase - * the preallocation size as the size of the file grows. Cap the maximum size + * the preallocation size as the size of the file grows. Cap the maximum size * at a single extent or less if the filesystem is near full. The closer the * filesystem is to full, the smaller the maximum prealocation. + * + * As an exception we don't do any preallocation at all if the file is smaller + * than the minimum preallocation and we are using the default dynamic + * preallocation scheme, as it is likely this is the only write to the file that + * is going to be done. + * + * We clean up any extra space left over when the file is closed in + * xfs_inactive(). */ STATIC xfs_fsblock_t xfs_iomap_prealloc_size( - struct xfs_mount *mp, struct xfs_inode *ip, - xfs_off_t offset, - struct xfs_bmbt_irec *imap, - int nimaps) + loff_t offset, + loff_t count, + xfs_extnum_t idx, + struct xfs_bmbt_irec *prev) { - xfs_fsblock_t alloc_blocks = 0; + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); int shift = 0; int64_t freesp; xfs_fsblock_t qblocks; int qshift = 0; + xfs_fsblock_t alloc_blocks = 0; + + if (offset + count <= XFS_ISIZE(ip)) + return 0; - alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, - imap, nimaps); + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && + (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))) + return 0; + + /* + * If an explicit allocsize is set, the file is small, or we + * are writing behind a hole, then use the minimum prealloc: + */ + if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) || + XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || + idx == 0 || + prev->br_startoff + prev->br_blockcount < offset_fsb) + return mp->m_writeio_blocks; + + /* + * Determine the initial size of the preallocation. We are beyond the + * current EOF here, but we need to take into account whether this is + * a sparse write or an extending write when determining the + * preallocation size. Hence we need to look up the extent that ends + * at the current write offset and use the result to determine the + * preallocation size. + * + * If the extent is a hole, then preallocation is essentially disabled. + * Otherwise we take the size of the preceding data extent as the basis + * for the preallocation size. If the size of the extent is greater than + * half the maximum extent length, then use the current offset as the + * basis. This ensures that for large files the preallocation size + * always extends to MAXEXTLEN rather than falling short due to things + * like stripe unit/width alignment of real extents. + */ + if (prev->br_blockcount <= (MAXEXTLEN >> 1)) + alloc_blocks = prev->br_blockcount << 1; + else + alloc_blocks = XFS_B_TO_FSB(mp, offset); if (!alloc_blocks) goto check_writeio; qblocks = alloc_blocks; @@ -550,120 +512,145 @@ xfs_iomap_prealloc_size( */ while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; - check_writeio: if (alloc_blocks < mp->m_writeio_blocks) alloc_blocks = mp->m_writeio_blocks; - trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, mp->m_writeio_blocks); - return alloc_blocks; } -int -xfs_iomap_write_delay( - xfs_inode_t *ip, - xfs_off_t offset, - size_t count, - xfs_bmbt_irec_t *ret_imap) +static int +xfs_file_iomap_begin_delay( + struct inode *inode, + loff_t offset, + loff_t count, + unsigned flags, + struct iomap *iomap) { - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t offset_fsb; - xfs_fileoff_t last_fsb; - xfs_off_t aligned_offset; - xfs_fileoff_t ioalign; - xfs_extlen_t extsz; - int nimaps; - xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int prealloc; - int error; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - /* - * Make sure that the dquots are there. This doesn't hold - * the ilock across a disk read. - */ - error = xfs_qm_dqattach_locked(ip, 0); - if (error) - return error; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t maxbytes_fsb = + XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + xfs_fileoff_t end_fsb, orig_end_fsb; + int error = 0, eof = 0; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec prev; + xfs_extnum_t idx; - extsz = xfs_get_extsz_hint(ip); - offset_fsb = XFS_B_TO_FSBT(mp, offset); + ASSERT(!XFS_IS_REALTIME_INODE(ip)); + ASSERT(!xfs_get_extsz_hint(ip)); - error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, - imap, XFS_WRITE_IMAPS, &prealloc); - if (error) - return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); -retry: - if (prealloc) { - xfs_fsblock_t alloc_blocks; + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto out_unlock; + } - alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap, - XFS_WRITE_IMAPS); + XFS_STATS_INC(mp, xs_blk_mapw); - aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); - ioalign = XFS_B_TO_FSBT(mp, aligned_offset); - last_fsb = ioalign + alloc_blocks; - } else { - last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; } - if (prealloc || extsz) { - error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); - if (error) - return error; + xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx, + &got, &prev); + if (!eof && got.br_startoff <= offset_fsb) { + trace_xfs_iomap_found(ip, offset, count, 0, &got); + goto done; } + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + goto out_unlock; + /* - * Make sure preallocation does not create extents beyond the range we - * actually support in this filesystem. + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages + * to keep the chunks of work done where somewhat symmetric with the + * work writeback does. This is a completely arbitrary number pulled + * out of thin air as a best guess for initial testing. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. */ - if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)) - last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + count = min_t(loff_t, count, 1024 * PAGE_SIZE); + end_fsb = orig_end_fsb = + min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + + if (eof) { + xfs_fsblock_t prealloc_blocks; - ASSERT(last_fsb > offset_fsb); + prealloc_blocks = + xfs_iomap_prealloc_size(ip, offset, count, idx, &prev); + if (prealloc_blocks) { + xfs_extlen_t align; + xfs_off_t end_offset; - nimaps = XFS_WRITE_IMAPS; - error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, - imap, &nimaps, XFS_BMAPI_ENTIRE); + end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1); + end_fsb = XFS_B_TO_FSBT(mp, end_offset) + + prealloc_blocks; + + align = xfs_eof_alignment(ip, 0); + if (align) + end_fsb = roundup_64(end_fsb, align); + + end_fsb = min(end_fsb, maxbytes_fsb); + ASSERT(end_fsb > offset_fsb); + } + } + +retry: + error = xfs_bmapi_reserve_delalloc(ip, offset_fsb, + end_fsb - offset_fsb, &got, + &prev, &idx, eof); switch (error) { case 0: + break; case -ENOSPC: case -EDQUOT: - break; - default: - return error; - } - - /* - * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry - * without EOF preallocation. - */ - if (nimaps == 0) { + /* retry without any preallocation */ trace_xfs_delalloc_enospc(ip, offset, count); - if (prealloc) { - prealloc = 0; - error = 0; + if (end_fsb != orig_end_fsb) { + end_fsb = orig_end_fsb; goto retry; } - return error ? error : -ENOSPC; + /*FALLTHRU*/ + default: + goto out_unlock; } - if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) - return xfs_alert_fsblock_zero(ip, &imap[0]); - /* * Tag the inode as speculatively preallocated so we can reclaim this * space on demand, if necessary. */ - if (prealloc) + if (end_fsb != orig_end_fsb) xfs_inode_set_eofblocks_tag(ip); - *ret_imap = imap[0]; - return 0; + trace_xfs_iomap_alloc(ip, offset, count, 0, &got); +done: + if (isnullstartblock(got.br_startblock)) + got.br_startblock = DELAYSTARTBLOCK; + + if (!got.br_startblock) { + error = xfs_alert_fsblock_zero(ip, &got); + if (error) + goto out_unlock; + } + + xfs_bmbt_to_iomap(ip, iomap, &got); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; } /* @@ -947,37 +934,13 @@ error_on_bmapi_transaction: return error; } -void -xfs_bmbt_to_iomap( - struct xfs_inode *ip, - struct iomap *iomap, - struct xfs_bmbt_irec *imap) -{ - struct xfs_mount *mp = ip->i_mount; - - if (imap->br_startblock == HOLESTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_DELALLOC; - } else { - iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); - if (imap->br_state == XFS_EXT_UNWRITTEN) - iomap->type = IOMAP_UNWRITTEN; - else - iomap->type = IOMAP_MAPPED; - } - iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); - iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); -} - -static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps) +static inline bool imap_needs_alloc(struct inode *inode, + struct xfs_bmbt_irec *imap, int nimaps) { return !nimaps || imap->br_startblock == HOLESTARTBLOCK || - imap->br_startblock == DELAYSTARTBLOCK; + imap->br_startblock == DELAYSTARTBLOCK || + (IS_DAX(inode) && ISUNWRITTEN(imap)); } static int @@ -993,11 +956,18 @@ xfs_file_iomap_begin( struct xfs_bmbt_irec imap; xfs_fileoff_t offset_fsb, end_fsb; int nimaps = 1, error = 0; + unsigned lockmode; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - xfs_ilock(ip, XFS_ILOCK_EXCL); + if ((flags & IOMAP_WRITE) && + !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { + return xfs_file_iomap_begin_delay(inode, offset, length, flags, + iomap); + } + + lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) @@ -1008,11 +978,11 @@ xfs_file_iomap_begin( error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ENTIRE); if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return error; } - if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) { + if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) { /* * We cap the maximum length we map here to MAX_WRITEBACK_PAGES * pages to keep the chunks of work done where somewhat symmetric @@ -1024,27 +994,23 @@ xfs_file_iomap_begin( * the lower level functions are updated. */ length = min_t(loff_t, length, 1024 * PAGE_SIZE); - if (xfs_get_extsz_hint(ip)) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - xfs_ilock_demote(ip, XFS_ILOCK_EXCL); - error = xfs_iomap_write_direct(ip, offset, length, &imap, - nimaps); - } else { - error = xfs_iomap_write_delay(ip, offset, length, &imap); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); + error = xfs_iomap_write_direct(ip, offset, length, &imap, + nimaps); if (error) return error; + iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); } else { ASSERT(nimaps); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, 0, &imap); } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index fb8aca3d69ab..6498be485932 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -25,8 +25,6 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); -int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *); int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, struct xfs_bmbt_irec *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 765f084759b5..2b6eec52178e 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -413,7 +413,8 @@ struct xlog { /* log record crc error injection factor */ uint32_t l_badcrc_factor; #endif - + /* log recovery lsn tracking (for buffer submission */ + xfs_lsn_t l_recovery_lsn; }; #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e8638fd2c0c3..846483d56949 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -44,6 +44,7 @@ #include "xfs_error.h" #include "xfs_dir2.h" #include "xfs_rmap_item.h" +#include "xfs_buf_item.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -381,6 +382,15 @@ xlog_recover_iodone( SHUTDOWN_META_IO_ERROR); } } + + /* + * On v5 supers, a bli could be attached to update the metadata LSN. + * Clean it up. + */ + if (bp->b_fspriv) + xfs_buf_item_relse(bp); + ASSERT(bp->b_fspriv == NULL); + bp->b_iodone = NULL; xfs_buf_ioend(bp); } @@ -2360,12 +2370,14 @@ static void xlog_recover_validate_buf_type( struct xfs_mount *mp, struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f) + xfs_buf_log_format_t *buf_f, + xfs_lsn_t current_lsn) { struct xfs_da_blkinfo *info = bp->b_addr; __uint32_t magic32; __uint16_t magic16; __uint16_t magicda; + char *warnmsg = NULL; /* * We can only do post recovery validation on items on CRC enabled @@ -2404,31 +2416,27 @@ xlog_recover_validate_buf_type( bp->b_ops = &xfs_rmapbt_buf_ops; break; default: - xfs_warn(mp, "Bad btree block magic!"); - ASSERT(0); + warnmsg = "Bad btree block magic!"; break; } break; case XFS_BLFT_AGF_BUF: if (magic32 != XFS_AGF_MAGIC) { - xfs_warn(mp, "Bad AGF block magic!"); - ASSERT(0); + warnmsg = "Bad AGF block magic!"; break; } bp->b_ops = &xfs_agf_buf_ops; break; case XFS_BLFT_AGFL_BUF: if (magic32 != XFS_AGFL_MAGIC) { - xfs_warn(mp, "Bad AGFL block magic!"); - ASSERT(0); + warnmsg = "Bad AGFL block magic!"; break; } bp->b_ops = &xfs_agfl_buf_ops; break; case XFS_BLFT_AGI_BUF: if (magic32 != XFS_AGI_MAGIC) { - xfs_warn(mp, "Bad AGI block magic!"); - ASSERT(0); + warnmsg = "Bad AGI block magic!"; break; } bp->b_ops = &xfs_agi_buf_ops; @@ -2438,8 +2446,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_GDQUOT_BUF: #ifdef CONFIG_XFS_QUOTA if (magic16 != XFS_DQUOT_MAGIC) { - xfs_warn(mp, "Bad DQUOT block magic!"); - ASSERT(0); + warnmsg = "Bad DQUOT block magic!"; break; } bp->b_ops = &xfs_dquot_buf_ops; @@ -2451,16 +2458,14 @@ xlog_recover_validate_buf_type( break; case XFS_BLFT_DINO_BUF: if (magic16 != XFS_DINODE_MAGIC) { - xfs_warn(mp, "Bad INODE block magic!"); - ASSERT(0); + warnmsg = "Bad INODE block magic!"; break; } bp->b_ops = &xfs_inode_buf_ops; break; case XFS_BLFT_SYMLINK_BUF: if (magic32 != XFS_SYMLINK_MAGIC) { - xfs_warn(mp, "Bad symlink block magic!"); - ASSERT(0); + warnmsg = "Bad symlink block magic!"; break; } bp->b_ops = &xfs_symlink_buf_ops; @@ -2468,8 +2473,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_BLOCK_BUF: if (magic32 != XFS_DIR2_BLOCK_MAGIC && magic32 != XFS_DIR3_BLOCK_MAGIC) { - xfs_warn(mp, "Bad dir block magic!"); - ASSERT(0); + warnmsg = "Bad dir block magic!"; break; } bp->b_ops = &xfs_dir3_block_buf_ops; @@ -2477,8 +2481,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_DATA_BUF: if (magic32 != XFS_DIR2_DATA_MAGIC && magic32 != XFS_DIR3_DATA_MAGIC) { - xfs_warn(mp, "Bad dir data magic!"); - ASSERT(0); + warnmsg = "Bad dir data magic!"; break; } bp->b_ops = &xfs_dir3_data_buf_ops; @@ -2486,8 +2489,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_FREE_BUF: if (magic32 != XFS_DIR2_FREE_MAGIC && magic32 != XFS_DIR3_FREE_MAGIC) { - xfs_warn(mp, "Bad dir3 free magic!"); - ASSERT(0); + warnmsg = "Bad dir3 free magic!"; break; } bp->b_ops = &xfs_dir3_free_buf_ops; @@ -2495,8 +2497,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_LEAF1_BUF: if (magicda != XFS_DIR2_LEAF1_MAGIC && magicda != XFS_DIR3_LEAF1_MAGIC) { - xfs_warn(mp, "Bad dir leaf1 magic!"); - ASSERT(0); + warnmsg = "Bad dir leaf1 magic!"; break; } bp->b_ops = &xfs_dir3_leaf1_buf_ops; @@ -2504,8 +2505,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_LEAFN_BUF: if (magicda != XFS_DIR2_LEAFN_MAGIC && magicda != XFS_DIR3_LEAFN_MAGIC) { - xfs_warn(mp, "Bad dir leafn magic!"); - ASSERT(0); + warnmsg = "Bad dir leafn magic!"; break; } bp->b_ops = &xfs_dir3_leafn_buf_ops; @@ -2513,8 +2513,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DA_NODE_BUF: if (magicda != XFS_DA_NODE_MAGIC && magicda != XFS_DA3_NODE_MAGIC) { - xfs_warn(mp, "Bad da node magic!"); - ASSERT(0); + warnmsg = "Bad da node magic!"; break; } bp->b_ops = &xfs_da3_node_buf_ops; @@ -2522,24 +2521,21 @@ xlog_recover_validate_buf_type( case XFS_BLFT_ATTR_LEAF_BUF: if (magicda != XFS_ATTR_LEAF_MAGIC && magicda != XFS_ATTR3_LEAF_MAGIC) { - xfs_warn(mp, "Bad attr leaf magic!"); - ASSERT(0); + warnmsg = "Bad attr leaf magic!"; break; } bp->b_ops = &xfs_attr3_leaf_buf_ops; break; case XFS_BLFT_ATTR_RMT_BUF: if (magic32 != XFS_ATTR3_RMT_MAGIC) { - xfs_warn(mp, "Bad attr remote magic!"); - ASSERT(0); + warnmsg = "Bad attr remote magic!"; break; } bp->b_ops = &xfs_attr3_rmt_buf_ops; break; case XFS_BLFT_SB_BUF: if (magic32 != XFS_SB_MAGIC) { - xfs_warn(mp, "Bad SB block magic!"); - ASSERT(0); + warnmsg = "Bad SB block magic!"; break; } bp->b_ops = &xfs_sb_buf_ops; @@ -2556,6 +2552,40 @@ xlog_recover_validate_buf_type( xfs_blft_from_flags(buf_f)); break; } + + /* + * Nothing else to do in the case of a NULL current LSN as this means + * the buffer is more recent than the change in the log and will be + * skipped. + */ + if (current_lsn == NULLCOMMITLSN) + return; + + if (warnmsg) { + xfs_warn(mp, warnmsg); + ASSERT(0); + } + + /* + * We must update the metadata LSN of the buffer as it is written out to + * ensure that older transactions never replay over this one and corrupt + * the buffer. This can occur if log recovery is interrupted at some + * point after the current transaction completes, at which point a + * subsequent mount starts recovery from the beginning. + * + * Write verifiers update the metadata LSN from log items attached to + * the buffer. Therefore, initialize a bli purely to carry the LSN to + * the verifier. We'll clean it up in our ->iodone() callback. + */ + if (bp->b_ops) { + struct xfs_buf_log_item *bip; + + ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_item_init(bp, mp); + bip = bp->b_fspriv; + bip->bli_item.li_lsn = current_lsn; + } } /* @@ -2569,7 +2599,8 @@ xlog_recover_do_reg_buffer( struct xfs_mount *mp, xlog_recover_item_t *item, struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f) + xfs_buf_log_format_t *buf_f, + xfs_lsn_t current_lsn) { int i; int bit; @@ -2642,7 +2673,7 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); - xlog_recover_validate_buf_type(mp, bp, buf_f); + xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); } /* @@ -2685,7 +2716,7 @@ xlog_recover_do_dquot_buffer( if (log->l_quotaoffs_flag & type) return false; - xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); return true; } @@ -2773,7 +2804,8 @@ xlog_recover_buffer_pass2( */ lsn = xlog_recover_get_buf_lsn(mp, bp); if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { - xlog_recover_validate_buf_type(mp, bp, buf_f); + trace_xfs_log_recover_buf_skip(log, buf_f); + xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); goto out_release; } @@ -2789,7 +2821,7 @@ xlog_recover_buffer_pass2( if (!dirty) goto out_release; } else { - xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); } /* @@ -3846,14 +3878,13 @@ STATIC int xlog_recover_commit_trans( struct xlog *log, struct xlog_recover *trans, - int pass) + int pass, + struct list_head *buffer_list) { int error = 0; - int error2; int items_queued = 0; struct xlog_recover_item *item; struct xlog_recover_item *next; - LIST_HEAD (buffer_list); LIST_HEAD (ra_list); LIST_HEAD (done_list); @@ -3876,7 +3907,7 @@ xlog_recover_commit_trans( items_queued++; if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { error = xlog_recover_items_pass2(log, trans, - &buffer_list, &ra_list); + buffer_list, &ra_list); list_splice_tail_init(&ra_list, &done_list); items_queued = 0; } @@ -3894,15 +3925,14 @@ out: if (!list_empty(&ra_list)) { if (!error) error = xlog_recover_items_pass2(log, trans, - &buffer_list, &ra_list); + buffer_list, &ra_list); list_splice_tail_init(&ra_list, &done_list); } if (!list_empty(&done_list)) list_splice_init(&done_list, &trans->r_itemq); - error2 = xfs_buf_delwri_submit(&buffer_list); - return error ? error : error2; + return error; } STATIC void @@ -4085,7 +4115,8 @@ xlog_recovery_process_trans( char *dp, unsigned int len, unsigned int flags, - int pass) + int pass, + struct list_head *buffer_list) { int error = 0; bool freeit = false; @@ -4109,7 +4140,8 @@ xlog_recovery_process_trans( error = xlog_recover_add_to_cont_trans(log, trans, dp, len); break; case XLOG_COMMIT_TRANS: - error = xlog_recover_commit_trans(log, trans, pass); + error = xlog_recover_commit_trans(log, trans, pass, + buffer_list); /* success or fail, we are now done with this transaction. */ freeit = true; break; @@ -4191,10 +4223,12 @@ xlog_recover_process_ophdr( struct xlog_op_header *ohead, char *dp, char *end, - int pass) + int pass, + struct list_head *buffer_list) { struct xlog_recover *trans; unsigned int len; + int error; /* Do we understand who wrote this op? */ if (ohead->oh_clientid != XFS_TRANSACTION && @@ -4221,8 +4255,39 @@ xlog_recover_process_ophdr( return 0; } + /* + * The recovered buffer queue is drained only once we know that all + * recovery items for the current LSN have been processed. This is + * required because: + * + * - Buffer write submission updates the metadata LSN of the buffer. + * - Log recovery skips items with a metadata LSN >= the current LSN of + * the recovery item. + * - Separate recovery items against the same metadata buffer can share + * a current LSN. I.e., consider that the LSN of a recovery item is + * defined as the starting LSN of the first record in which its + * transaction appears, that a record can hold multiple transactions, + * and/or that a transaction can span multiple records. + * + * In other words, we are allowed to submit a buffer from log recovery + * once per current LSN. Otherwise, we may incorrectly skip recovery + * items and cause corruption. + * + * We don't know up front whether buffers are updated multiple times per + * LSN. Therefore, track the current LSN of each commit log record as it + * is processed and drain the queue when it changes. Use commit records + * because they are ordered correctly by the logging code. + */ + if (log->l_recovery_lsn != trans->r_lsn && + ohead->oh_flags & XLOG_COMMIT_TRANS) { + error = xfs_buf_delwri_submit(buffer_list); + if (error) + return error; + log->l_recovery_lsn = trans->r_lsn; + } + return xlog_recovery_process_trans(log, trans, dp, len, - ohead->oh_flags, pass); + ohead->oh_flags, pass, buffer_list); } /* @@ -4240,7 +4305,8 @@ xlog_recover_process_data( struct hlist_head rhash[], struct xlog_rec_header *rhead, char *dp, - int pass) + int pass, + struct list_head *buffer_list) { struct xlog_op_header *ohead; char *end; @@ -4254,6 +4320,7 @@ xlog_recover_process_data( if (xlog_header_check_recover(log->l_mp, rhead)) return -EIO; + trace_xfs_log_recover_record(log, rhead, pass); while ((dp < end) && num_logops) { ohead = (struct xlog_op_header *)dp; @@ -4262,7 +4329,7 @@ xlog_recover_process_data( /* errors will abort recovery */ error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, - dp, end, pass); + dp, end, pass, buffer_list); if (error) return error; @@ -4685,7 +4752,8 @@ xlog_recover_process( struct hlist_head rhash[], struct xlog_rec_header *rhead, char *dp, - int pass) + int pass, + struct list_head *buffer_list) { int error; __le32 crc; @@ -4732,7 +4800,8 @@ xlog_recover_process( if (error) return error; - return xlog_recover_process_data(log, rhash, rhead, dp, pass); + return xlog_recover_process_data(log, rhash, rhead, dp, pass, + buffer_list); } STATIC int @@ -4793,9 +4862,11 @@ xlog_do_recovery_pass( char *offset; xfs_buf_t *hbp, *dbp; int error = 0, h_size, h_len; + int error2 = 0; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; struct hlist_head rhash[XLOG_RHASH_SIZE]; + LIST_HEAD (buffer_list); ASSERT(head_blk != tail_blk); rhead_blk = 0; @@ -4981,7 +5052,7 @@ xlog_do_recovery_pass( } error = xlog_recover_process(log, rhash, rhead, offset, - pass); + pass, &buffer_list); if (error) goto bread_err2; @@ -5012,7 +5083,8 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - error = xlog_recover_process(log, rhash, rhead, offset, pass); + error = xlog_recover_process(log, rhash, rhead, offset, pass, + &buffer_list); if (error) goto bread_err2; @@ -5025,10 +5097,17 @@ xlog_do_recovery_pass( bread_err1: xlog_put_bp(hbp); + /* + * Submit buffers that have been added from the last record processed, + * regardless of error status. + */ + if (!list_empty(&buffer_list)) + error2 = xfs_buf_delwri_submit(&buffer_list); + if (error && first_bad) *first_bad = rhead_blk; - return error; + return error ? error : error2; } /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index faeead671f9f..56e85a6c85c7 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -934,6 +934,20 @@ xfs_mountfs( } /* + * Now the log is fully replayed, we can transition to full read-only + * mode for read-only mounts. This will sync all the metadata and clean + * the log so that the recovery we just performed does not have to be + * replayed again on the next mount. + * + * We use the same quiesce mechanism as the rw->ro remount, as they are + * semantically identical operations. + */ + if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) == + XFS_MOUNT_RDONLY) { + xfs_quiesce_attr(mp); + } + + /* * Complete the quota initialisation, post-log-replay component. */ if (quotamount) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b36676cde103..041d9493e798 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -57,10 +57,16 @@ enum { #define XFS_ERR_RETRY_FOREVER -1 +/* + * Although retry_timeout is in jiffies which is normally an unsigned long, + * we limit the retry timeout to 86400 seconds, or one day. So even a + * signed 32-bit long is sufficient for a HZ value up to 24855. Making it + * signed lets us store the special "-1" value, meaning retry forever. + */ struct xfs_error_cfg { struct xfs_kobj kobj; int max_retries; - unsigned long retry_timeout; /* in jiffies, 0 = no timeout */ + long retry_timeout; /* in jiffies, -1 = infinite */ }; typedef struct xfs_mount { @@ -325,6 +331,22 @@ xfs_mp_fail_writes(struct xfs_mount *mp) } #endif +/* per-AG block reservation data structures*/ +enum xfs_ag_resv_type { + XFS_AG_RESV_NONE = 0, + XFS_AG_RESV_METADATA, + XFS_AG_RESV_AGFL, +}; + +struct xfs_ag_resv { + /* number of blocks originally reserved here */ + xfs_extlen_t ar_orig_reserved; + /* number of blocks reserved here */ + xfs_extlen_t ar_reserved; + /* number of blocks originally asked for */ + xfs_extlen_t ar_asked; +}; + /* * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection. @@ -372,8 +394,28 @@ typedef struct xfs_perag { /* for rcu-safe freeing */ struct rcu_head rcu_head; int pagb_count; /* pagb slots in use */ + + /* Blocks reserved for all kinds of metadata. */ + struct xfs_ag_resv pag_meta_resv; + /* Blocks reserved for just AGFL-based metadata. */ + struct xfs_ag_resv pag_agfl_resv; } xfs_perag_t; +static inline struct xfs_ag_resv * +xfs_perag_resv( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + switch (type) { + case XFS_AG_RESV_METADATA: + return &pag->pag_meta_resv; + case XFS_AG_RESV_AGFL: + return &pag->pag_agfl_resv; + default: + return NULL; + } +} + extern void xfs_uuid_table_free(void); extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 2500f28689d5..0432a459871c 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -51,28 +51,16 @@ xfs_rui_item_free( kmem_zone_free(xfs_rui_zone, ruip); } -/* - * This returns the number of iovecs needed to log the given rui item. - * We only need 1 iovec for an rui item. It just logs the rui_log_format - * structure. - */ -static inline int -xfs_rui_item_sizeof( - struct xfs_rui_log_item *ruip) -{ - return sizeof(struct xfs_rui_log_format) + - (ruip->rui_format.rui_nextents - 1) * - sizeof(struct xfs_map_extent); -} - STATIC void xfs_rui_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { + struct xfs_rui_log_item *ruip = RUI_ITEM(lip); + *nvecs += 1; - *nbytes += xfs_rui_item_sizeof(RUI_ITEM(lip)); + *nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents); } /* @@ -97,7 +85,7 @@ xfs_rui_item_format( ruip->rui_format.rui_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format, - xfs_rui_item_sizeof(ruip)); + xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents)); } /* @@ -205,16 +193,12 @@ xfs_rui_init( { struct xfs_rui_log_item *ruip; - uint size; ASSERT(nextents > 0); - if (nextents > XFS_RUI_MAX_FAST_EXTENTS) { - size = (uint)(sizeof(struct xfs_rui_log_item) + - ((nextents - 1) * sizeof(struct xfs_map_extent))); - ruip = kmem_zalloc(size, KM_SLEEP); - } else { + if (nextents > XFS_RUI_MAX_FAST_EXTENTS) + ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP); + else ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); - } xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); ruip->rui_format.rui_nextents = nextents; @@ -239,14 +223,12 @@ xfs_rui_copy_format( uint len; src_rui_fmt = buf->i_addr; - len = sizeof(struct xfs_rui_log_format) + - (src_rui_fmt->rui_nextents - 1) * - sizeof(struct xfs_map_extent); + len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents); if (buf->i_len != len) return -EFSCORRUPTED; - memcpy((char *)dst_rui_fmt, (char *)src_rui_fmt, len); + memcpy(dst_rui_fmt, src_rui_fmt, len); return 0; } diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index aefcc3a318a5..340c968e1f9c 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -70,6 +70,14 @@ struct xfs_rui_log_item { struct xfs_rui_log_format rui_format; }; +static inline size_t +xfs_rui_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_rui_log_item, rui_format) + + xfs_rui_log_format_sizeof(nr); +} + /* * This is the "rmap update done" log item. It is used to log the fact that * some rmapbt updates mentioned in an earlier rui item have been performed. diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index fd6be45b3a1e..2d092f9577ca 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1137,7 +1137,7 @@ xfs_restore_resvblks(struct xfs_mount *mp) * Note: xfs_log_quiesce() stops background log work - the callers must ensure * it is started again when appropriate. */ -static void +void xfs_quiesce_attr( struct xfs_mount *mp) { @@ -1782,9 +1782,8 @@ xfs_init_zones(void) if (!xfs_rud_zone) goto out_destroy_icreate_zone; - xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) + - ((XFS_RUI_MAX_FAST_EXTENTS - 1) * - sizeof(struct xfs_map_extent))), + xfs_rui_zone = kmem_zone_init( + xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS), "xfs_rui_item"); if (!xfs_rui_zone) goto out_destroy_rud_zone; diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 529bce9fc37e..b6418abd85ad 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -61,6 +61,7 @@ struct xfs_mount; struct xfs_buftarg; struct block_device; +extern void xfs_quiesce_attr(struct xfs_mount *mp); extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 79cfd3fc5324..5f8d55d29a11 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -393,9 +393,15 @@ max_retries_show( struct kobject *kobject, char *buf) { + int retries; struct xfs_error_cfg *cfg = to_error_cfg(kobject); - return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries); + if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER) + retries = -1; + else + retries = cfg->max_retries; + + return snprintf(buf, PAGE_SIZE, "%d\n", retries); } static ssize_t @@ -415,7 +421,10 @@ max_retries_store( if (val < -1) return -EINVAL; - cfg->max_retries = val; + if (val == -1) + cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; + else + cfg->max_retries = val; return count; } XFS_SYSFS_ATTR_RW(max_retries); @@ -425,10 +434,15 @@ retry_timeout_seconds_show( struct kobject *kobject, char *buf) { + int timeout; struct xfs_error_cfg *cfg = to_error_cfg(kobject); - return snprintf(buf, PAGE_SIZE, "%ld\n", - jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC); + if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER) + timeout = -1; + else + timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC; + + return snprintf(buf, PAGE_SIZE, "%d\n", timeout); } static ssize_t @@ -445,11 +459,16 @@ retry_timeout_seconds_store( if (ret) return ret; - /* 1 day timeout maximum */ - if (val < 0 || val > 86400) + /* 1 day timeout maximum, -1 means infinite */ + if (val < -1 || val > 86400) return -EINVAL; - cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); + if (val == -1) + cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; + else { + cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); + ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX); + } return count; } XFS_SYSFS_ATTR_RW(retry_timeout_seconds); @@ -519,18 +538,19 @@ struct xfs_error_init { static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = { { .name = "default", .max_retries = XFS_ERR_RETRY_FOREVER, - .retry_timeout = 0, + .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "EIO", .max_retries = XFS_ERR_RETRY_FOREVER, - .retry_timeout = 0, + .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "ENOSPC", .max_retries = XFS_ERR_RETRY_FOREVER, - .retry_timeout = 0, + .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "ENODEV", - .max_retries = 0, + .max_retries = 0, /* We can't recover from devices disappearing */ + .retry_timeout = 0, }, }; @@ -561,7 +581,10 @@ xfs_error_sysfs_init_class( goto out_error; cfg->max_retries = init[i].max_retries; - cfg->retry_timeout = msecs_to_jiffies( + if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER) + cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; + else + cfg->retry_timeout = msecs_to_jiffies( init[i].retry_timeout * MSEC_PER_SEC); } return 0; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d303a665dba9..c6b2b1dcde75 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1570,14 +1570,15 @@ TRACE_EVENT(xfs_agf, TRACE_EVENT(xfs_free_extent, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, bool isfl, int haveleft, int haveright), - TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright), + xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft, + int haveright), + TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(int, isfl) + __field(int, resv) __field(int, haveleft) __field(int, haveright) ), @@ -1586,16 +1587,16 @@ TRACE_EVENT(xfs_free_extent, __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->isfl = isfl; + __entry->resv = resv; __entry->haveleft = haveleft; __entry->haveright = haveright; ), - TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s", + TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, - __entry->isfl, + __entry->resv, __entry->haveleft ? (__entry->haveright ? "both" : "left") : (__entry->haveright ? "right" : "none")) @@ -1622,8 +1623,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __field(short, otype) __field(char, wasdel) __field(char, wasfromfl) - __field(char, isfl) - __field(char, userdata) + __field(int, resv) + __field(int, datatype) __field(xfs_fsblock_t, firstblock) ), TP_fast_assign( @@ -1643,14 +1644,14 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __entry->otype = args->otype; __entry->wasdel = args->wasdel; __entry->wasfromfl = args->wasfromfl; - __entry->isfl = args->isfl; - __entry->userdata = args->userdata; + __entry->resv = args->resv; + __entry->datatype = args->datatype; __entry->firstblock = args->firstblock; ), TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " "prod %u minleft %u total %u alignment %u minalignslop %u " - "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " - "userdata %d firstblock 0x%llx", + "len %u type %s otype %s wasdel %d wasfromfl %d resv %d " + "datatype 0x%x firstblock 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -1667,8 +1668,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), __entry->wasdel, __entry->wasfromfl, - __entry->isfl, - __entry->userdata, + __entry->resv, + __entry->datatype, (unsigned long long)__entry->firstblock) ) @@ -1984,6 +1985,29 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); +TRACE_EVENT(xfs_log_recover_record, + TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), + TP_ARGS(log, rhead, pass), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, lsn) + __field(int, len) + __field(int, num_logops) + __field(int, pass) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->lsn = be64_to_cpu(rhead->h_lsn); + __entry->len = be32_to_cpu(rhead->h_len); + __entry->num_logops = be32_to_cpu(rhead->h_num_logops); + __entry->pass = pass; + ), + TP_printk("dev %d:%d lsn 0x%llx len 0x%x num_logops 0x%x pass %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lsn, __entry->len, __entry->num_logops, + __entry->pass) +) + DECLARE_EVENT_CLASS(xfs_log_recover_item_class, TP_PROTO(struct xlog *log, struct xlog_recover *trans, struct xlog_recover_item *item, int pass), @@ -1992,6 +2016,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class, __field(dev_t, dev) __field(unsigned long, item) __field(xlog_tid_t, tid) + __field(xfs_lsn_t, lsn) __field(int, type) __field(int, pass) __field(int, count) @@ -2001,15 +2026,17 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class, __entry->dev = log->l_mp->m_super->s_dev; __entry->item = (unsigned long)item; __entry->tid = trans->r_log_tid; + __entry->lsn = trans->r_lsn; __entry->type = ITEM_TYPE(item); __entry->pass = pass; __entry->count = item->ri_cnt; __entry->total = item->ri_total; ), - TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " - "item region count/total %d/%d", + TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item 0x%p, " + "item type %s item region count/total %d/%d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, + __entry->lsn, __entry->pass, (void *)__entry->item, __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), @@ -2068,6 +2095,7 @@ DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_skip); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); @@ -2558,6 +2586,60 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); +/* per-AG reservation */ +DECLARE_EVENT_CLASS(xfs_ag_resv_class, + TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv, + xfs_extlen_t len), + TP_ARGS(pag, resv, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, resv) + __field(xfs_extlen_t, freeblks) + __field(xfs_extlen_t, flcount) + __field(xfs_extlen_t, reserved) + __field(xfs_extlen_t, asked) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + struct xfs_ag_resv *r = xfs_perag_resv(pag, resv); + + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->resv = resv; + __entry->freeblks = pag->pagf_freeblks; + __entry->flcount = pag->pagf_flcount; + __entry->reserved = r ? r->ar_reserved : 0; + __entry->asked = r ? r->ar_asked : 0; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->resv, + __entry->freeblks, + __entry->flcount, + __entry->reserved, + __entry->asked, + __entry->len) +) +#define DEFINE_AG_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_ag_resv_class, name, \ + TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type type, \ + xfs_extlen_t len), \ + TP_ARGS(pag, type, len)) + +/* per-AG reservation tracepoints */ +DEFINE_AG_RESV_EVENT(xfs_ag_resv_init); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_free); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_alloc_extent); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed); + +DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error); +DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 5f3d33d16e67..70f42ea86dfb 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -217,7 +217,7 @@ undo_log: undo_blocks: if (blocks > 0) { - xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); + xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } @@ -318,7 +318,6 @@ xfs_trans_mod_sb( * in-core superblock's counter. This should only * be applied to the on-disk superblock. */ - ASSERT(delta < 0); tp->t_res_fdblocks_delta += delta; if (xfs_sb_version_haslazysbcount(&mp->m_sb)) flags &= ~XFS_TRANS_SB_DIRTY; diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 459ddec137a4..ab438647592a 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -79,7 +79,8 @@ xfs_trans_free_extent( trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); - error = xfs_free_extent(tp, start_block, ext_len, oinfo); + error = xfs_free_extent(tp, start_block, ext_len, oinfo, + XFS_AG_RESV_NONE); /* * Mark the transaction dirty, even on error. This ensures the diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index ea62245fee26..62900938f26d 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -147,6 +147,7 @@ __xfs_xattr_put_listent( arraytop = context->count + prefix_len + namelen + 1; if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ + context->seen_enough = 1; return 0; } offset = (char *)context->alist + context->count; |