diff options
Diffstat (limited to 'net/unix')
-rw-r--r-- | net/unix/Kconfig | 11 | ||||
-rw-r--r-- | net/unix/Makefile | 2 | ||||
-rw-r--r-- | net/unix/af_unix.c | 178 | ||||
-rw-r--r-- | net/unix/garbage.c | 675 | ||||
-rw-r--r-- | net/unix/scm.c | 154 | ||||
-rw-r--r-- | net/unix/scm.h | 10 |
6 files changed, 564 insertions, 466 deletions
diff --git a/net/unix/Kconfig b/net/unix/Kconfig index b7f811216820..8b5d04210d7c 100644 --- a/net/unix/Kconfig +++ b/net/unix/Kconfig @@ -4,7 +4,7 @@ # config UNIX - tristate "Unix domain sockets" + bool "Unix domain sockets" help If you say Y here, you will include support for Unix domain sockets; sockets are the standard Unix mechanism for establishing and @@ -14,17 +14,8 @@ config UNIX an embedded system or something similar, you therefore definitely want to say Y here. - To compile this driver as a module, choose M here: the module will be - called unix. Note that several important services won't work - correctly if you say M here and then neglect to load the module. - Say Y unless you know what you are doing. -config UNIX_SCM - bool - depends on UNIX - default y - config AF_UNIX_OOB bool depends on UNIX diff --git a/net/unix/Makefile b/net/unix/Makefile index 20491825b4d0..4ddd125c4642 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o obj-$(CONFIG_UNIX_DIAG) += unix_diag.o unix_diag-y := diag.o - -obj-$(CONFIG_UNIX_SCM) += scm.o diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5ce60087086c..2c33d787b860 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -116,8 +116,6 @@ #include <linux/file.h> #include <linux/btf_ids.h> -#include "scm.h" - static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; @@ -579,6 +577,11 @@ static void unix_sock_destructor(struct sock *sk) #endif } +static unsigned int unix_skb_len(const struct sk_buff *skb) +{ + return skb->len - UNIXCB(skb).consumed; +} + static void unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); @@ -606,20 +609,23 @@ static void unix_release_sock(struct sock *sk, int embrion) unix_state_unlock(sk); #if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (u->oob_skb) { - kfree_skb(u->oob_skb); - u->oob_skb = NULL; - } + u->oob_skb = NULL; #endif wake_up_interruptible_all(&u->peer_wait); if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (skb && !unix_skb_len(skb)) + skb = skb_peek_next(skb, &sk->sk_receive_queue); +#endif unix_state_lock(skpair); /* No more writes */ WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); - if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) + if (skb || embrion) WRITE_ONCE(skpair->sk_err, ECONNRESET); unix_state_unlock(skpair); skpair->sk_state_change(skpair); @@ -956,11 +962,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); - u->inflight = 0; + u->listener = NULL; + u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); @@ -1559,6 +1565,7 @@ restart: newsk->sk_type = sk->sk_type; init_peercred(newsk); newu = unix_sk(newsk); + newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); @@ -1652,8 +1659,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk; - struct sock *tsk; struct sk_buff *skb; + struct sock *tsk; int err; err = -EOPNOTSUPP; @@ -1683,6 +1690,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, /* attach accepted sock to socket */ unix_state_lock(tsk); + unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); @@ -1726,51 +1734,65 @@ out: return err; } +/* The "user->unix_inflight" variable is protected by the garbage + * collection lock, and we just read it locklessly here. If you go + * over the limit, there might be a tiny race in actually noticing + * it across threads. Tough. + */ +static inline bool too_many_unix_fds(struct task_struct *p) +{ + struct user_struct *user = current_user(); + + if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); + return false; +} + +static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + if (too_many_unix_fds(current)) + return -ETOOMANYREFS; + + /* Need to duplicate file references for the sake of garbage + * collection. Otherwise a socket in the fps might become a + * candidate for GC while the skb is not yet queued. + */ + UNIXCB(skb).fp = scm_fp_dup(scm->fp); + if (!UNIXCB(skb).fp) + return -ENOMEM; + + if (unix_prepare_fpl(UNIXCB(skb).fp)) + return -ENOMEM; + + return 0; +} + +static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + scm->fp = UNIXCB(skb).fp; + UNIXCB(skb).fp = NULL; + + unix_destroy_fpl(scm->fp); +} + static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); +} - /* - * Garbage collection of unix sockets starts by selecting a set of - * candidate sockets which have reference only from being in flight - * (total_refs == inflight_refs). This condition is checked once during - * the candidate collection phase, and candidates are marked as such, so - * that non-candidates can later be ignored. While inflight_refs is - * protected by unix_gc_lock, total_refs (file count) is not, hence this - * is an instantaneous decision. - * - * Once a candidate, however, the socket must not be reinstalled into a - * file descriptor while the garbage collection is in progress. - * - * If the above conditions are met, then the directed graph of - * candidates (*) does not change while unix_gc_lock is held. - * - * Any operations that changes the file count through file descriptors - * (dup, close, sendmsg) does not change the graph since candidates are - * not installed in fds. - * - * Dequeing a candidate via recvmsg would install it into an fd, but - * that takes unix_gc_lock to decrement the inflight count, so it's - * serialized with garbage collection. - * - * MSG_PEEK is special in that it does not change the inflight count, - * yet does install the socket into an fd. The following lock/unlock - * pair is to ensure serialization with garbage collection. It must be - * done between incrementing the file count and installing the file into - * an fd. - * - * If garbage collection starts after the barrier provided by the - * lock/unlock, then it will see the elevated refcount and not mark this - * as a candidate. If a garbage collection is already in progress - * before the file count was incremented, then the lock/unlock pair will - * ensure that garbage collection is finished before progressing to - * installing the fd. - * - * (*) A -> B where B is on the queue of A or B is on the queue of C - * which is on the queue of listening socket A. - */ - spin_lock(&unix_gc_lock); - spin_unlock(&unix_gc_lock); +static void unix_destruct_scm(struct sk_buff *skb) +{ + struct scm_cookie scm; + + memset(&scm, 0, sizeof(scm)); + scm.pid = UNIXCB(skb).pid; + if (UNIXCB(skb).fp) + unix_detach_fds(&scm, skb); + + /* Alas, it calls VFS */ + /* So fscking what? fput() had been SMP-safe since the last Summer */ + scm_destroy(&scm); + sock_wfree(skb); } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) @@ -1845,8 +1867,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_add(fp->count, &u->scm_stat.nr_fds); + unix_add_edges(fp, u); + } } static void scm_stat_del(struct sock *sk, struct sk_buff *skb) @@ -1854,8 +1878,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_sub(fp->count, &u->scm_stat.nr_fds); + unix_del_edges(fp); + } } /* @@ -1875,11 +1901,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, long timeo; int err; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto out; @@ -2114,13 +2141,9 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other } maybe_add_creds(skb, sock, other); - skb_get(skb); - scm_stat_add(other, skb); spin_lock(&other->sk_receive_queue.lock); - if (ousk->oob_skb) - consume_skb(ousk->oob_skb); WRITE_ONCE(ousk->oob_skb, skb); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); @@ -2145,11 +2168,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, bool fds_sent = false; int data_len; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) @@ -2580,11 +2604,6 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, return timeo; } -static unsigned int unix_skb_len(const struct sk_buff *skb) -{ - return skb->len - UNIXCB(skb).consumed; -} - struct unix_stream_read_state { int (*recv_actor)(struct sk_buff *, int, int, struct unix_stream_read_state *); @@ -2599,11 +2618,11 @@ struct unix_stream_read_state { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int unix_stream_recv_urg(struct unix_stream_read_state *state) { + struct sk_buff *oob_skb, *read_skb = NULL; struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); int chunk = 1; - struct sk_buff *oob_skb; mutex_lock(&u->iolock); unix_state_lock(sk); @@ -2618,10 +2637,15 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) oob_skb = u->oob_skb; - if (!(state->flags & MSG_PEEK)) + if (!(state->flags & MSG_PEEK)) { WRITE_ONCE(u->oob_skb, NULL); - else - skb_get(oob_skb); + + if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue && + !unix_skb_len(oob_skb->prev)) { + read_skb = oob_skb->prev; + __skb_unlink(read_skb, &sk->sk_receive_queue); + } + } spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); @@ -2631,10 +2655,10 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) if (!(state->flags & MSG_PEEK)) UNIXCB(oob_skb).consumed += 1; - consume_skb(oob_skb); - mutex_unlock(&u->iolock); + consume_skb(read_skb); + if (chunk < 0) return -EFAULT; @@ -2660,12 +2684,10 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, if (copied) { skb = NULL; } else if (!(flags & MSG_PEEK)) { - if (sock_flag(sk, SOCK_URGINLINE)) { - WRITE_ONCE(u->oob_skb, NULL); - consume_skb(skb); - } else { + WRITE_ONCE(u->oob_skb, NULL); + + if (!sock_flag(sk, SOCK_URGINLINE)) { __skb_unlink(skb, &sk->sk_receive_queue); - WRITE_ONCE(u->oob_skb, NULL); unlinked_skb = skb; skb = skb_peek(&sk->sk_receive_queue); } @@ -2676,10 +2698,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, spin_unlock(&sk->sk_receive_queue.lock); - if (unlinked_skb) { - WARN_ON_ONCE(skb_unref(unlinked_skb)); - kfree_skb(unlinked_skb); - } + kfree_skb(unlinked_skb); } return skb; } @@ -2722,7 +2741,6 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) unix_state_unlock(sk); if (drop) { - WARN_ON_ONCE(skb_unref(skb)); kfree_skb(skb); return -EAGAIN; } diff --git a/net/unix/garbage.c b/net/unix/garbage.c index d2fc795394a5..0068e758be4d 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -81,278 +81,533 @@ #include <net/scm.h> #include <net/tcp_states.h> -#include "scm.h" +struct unix_sock *unix_get_socket(struct file *filp) +{ + struct inode *inode = file_inode(filp); + + /* Socket ? */ + if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { + struct socket *sock = SOCKET_I(inode); + const struct proto_ops *ops; + struct sock *sk = sock->sk; + + ops = READ_ONCE(sock->ops); -/* Internal data structures and random procedures: */ + /* PF_UNIX ? */ + if (sk && ops && ops->family == PF_UNIX) + return unix_sk(sk); + } -static LIST_HEAD(gc_candidates); -static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); + return NULL; +} -static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) +static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) { - struct sk_buff *skb; - struct sk_buff *next; - - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - /* Do we have file descriptors ? */ - if (UNIXCB(skb).fp) { - bool hit = false; - /* Process the descriptors of this socket */ - int nfd = UNIXCB(skb).fp->count; - struct file **fp = UNIXCB(skb).fp->fp; - - while (nfd--) { - /* Get the socket the fd matches if it indeed does so */ - struct sock *sk = unix_get_socket(*fp++); - - if (sk) { - struct unix_sock *u = unix_sk(sk); - - /* Ignore non-candidates, they could - * have been added to the queues after - * starting the garbage collection - */ - if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { - hit = true; - - func(u); - } - } - } - if (hit && hitlist != NULL) { - __skb_unlink(skb, &x->sk_receive_queue); - __skb_queue_tail(hitlist, skb); - } - } + /* If an embryo socket has a fd, + * the listener indirectly holds the fd's refcnt. + */ + if (edge->successor->listener) + return unix_sk(edge->successor->listener)->vertex; + + return edge->successor->vertex; +} + +static bool unix_graph_maybe_cyclic; +static bool unix_graph_grouped; + +static void unix_update_graph(struct unix_vertex *vertex) +{ + /* If the receiver socket is not inflight, no cyclic + * reference could be formed. + */ + if (!vertex) + return; + + unix_graph_maybe_cyclic = true; + unix_graph_grouped = false; +} + +static LIST_HEAD(unix_unvisited_vertices); + +enum unix_vertex_index { + UNIX_VERTEX_INDEX_MARK1, + UNIX_VERTEX_INDEX_MARK2, + UNIX_VERTEX_INDEX_START, +}; + +static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; + +static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) +{ + struct unix_vertex *vertex = edge->predecessor->vertex; + + if (!vertex) { + vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); + vertex->index = unix_vertex_unvisited_index; + vertex->out_degree = 0; + INIT_LIST_HEAD(&vertex->edges); + INIT_LIST_HEAD(&vertex->scc_entry); + + list_move_tail(&vertex->entry, &unix_unvisited_vertices); + edge->predecessor->vertex = vertex; } - spin_unlock(&x->sk_receive_queue.lock); + + vertex->out_degree++; + list_add_tail(&edge->vertex_entry, &vertex->edges); + + unix_update_graph(unix_edge_successor(edge)); } -static void scan_children(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) +static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { - if (x->sk_state != TCP_LISTEN) { - scan_inflight(x, func, hitlist); - } else { - struct sk_buff *skb; - struct sk_buff *next; - struct unix_sock *u; - LIST_HEAD(embryos); + struct unix_vertex *vertex = edge->predecessor->vertex; - /* For a listening socket collect the queued embryos - * and perform a scan on them as well. - */ - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - u = unix_sk(skb->sk); + if (!fpl->dead) + unix_update_graph(unix_edge_successor(edge)); - /* An embryo cannot be in-flight, so it's safe - * to use the list link. - */ - BUG_ON(!list_empty(&u->link)); - list_add_tail(&u->link, &embryos); - } - spin_unlock(&x->sk_receive_queue.lock); + list_del(&edge->vertex_entry); + vertex->out_degree--; - while (!list_empty(&embryos)) { - u = list_entry(embryos.next, struct unix_sock, link); - scan_inflight(&u->sk, func, hitlist); - list_del_init(&u->link); - } + if (!vertex->out_degree) { + edge->predecessor->vertex = NULL; + list_move_tail(&vertex->entry, &fpl->vertices); } } -static void dec_inflight(struct unix_sock *usk) +static void unix_free_vertices(struct scm_fp_list *fpl) { - usk->inflight--; + struct unix_vertex *vertex, *next_vertex; + + list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) { + list_del(&vertex->entry); + kfree(vertex); + } } -static void inc_inflight(struct unix_sock *usk) +static DEFINE_SPINLOCK(unix_gc_lock); +unsigned int unix_tot_inflight; + +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) { - usk->inflight++; + int i = 0, j = 0; + + spin_lock(&unix_gc_lock); + + if (!fpl->count_unix) + goto out; + + do { + struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]); + struct unix_edge *edge; + + if (!inflight) + continue; + + edge = fpl->edges + i++; + edge->predecessor = inflight; + edge->successor = receiver; + + unix_add_edge(fpl, edge); + } while (i < fpl->count_unix); + + receiver->scm_stat.nr_unix_fds += fpl->count_unix; + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); +out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); + + spin_unlock(&unix_gc_lock); + + fpl->inflight = true; + + unix_free_vertices(fpl); } -static void inc_inflight_move_tail(struct unix_sock *u) +void unix_del_edges(struct scm_fp_list *fpl) { - u->inflight++; + struct unix_sock *receiver; + int i = 0; + + spin_lock(&unix_gc_lock); + + if (!fpl->count_unix) + goto out; + + do { + struct unix_edge *edge = fpl->edges + i++; + + unix_del_edge(fpl, edge); + } while (i < fpl->count_unix); + + if (!fpl->dead) { + receiver = fpl->edges[0].successor; + receiver->scm_stat.nr_unix_fds -= fpl->count_unix; + } + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); +out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); + + spin_unlock(&unix_gc_lock); + + fpl->inflight = false; +} - /* If this still might be part of a cycle, move it to the end - * of the list, so that it's checked even if it was already - * passed over +void unix_update_edges(struct unix_sock *receiver) +{ + /* nr_unix_fds is only updated under unix_state_lock(). + * If it's 0 here, the embryo socket is not part of the + * inflight graph, and GC will not see it, so no lock needed. */ - if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags)) - list_move_tail(&u->link, &gc_candidates); + if (!receiver->scm_stat.nr_unix_fds) { + receiver->listener = NULL; + } else { + spin_lock(&unix_gc_lock); + unix_update_graph(unix_sk(receiver->listener)->vertex); + receiver->listener = NULL; + spin_unlock(&unix_gc_lock); + } } -static bool gc_in_progress; -#define UNIX_INFLIGHT_TRIGGER_GC 16000 +int unix_prepare_fpl(struct scm_fp_list *fpl) +{ + struct unix_vertex *vertex; + int i; + + if (!fpl->count_unix) + return 0; + + for (i = 0; i < fpl->count_unix; i++) { + vertex = kmalloc(sizeof(*vertex), GFP_KERNEL); + if (!vertex) + goto err; + + list_add(&vertex->entry, &fpl->vertices); + } + + fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges), + GFP_KERNEL_ACCOUNT); + if (!fpl->edges) + goto err; -void wait_for_unix_gc(void) + return 0; + +err: + unix_free_vertices(fpl); + return -ENOMEM; +} + +void unix_destroy_fpl(struct scm_fp_list *fpl) { - /* If number of inflight sockets is insane, - * force a garbage collect right now. - * Paired with the WRITE_ONCE() in unix_inflight(), - * unix_notinflight() and gc_in_progress(). - */ - if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && - !READ_ONCE(gc_in_progress)) - unix_gc(); - wait_event(unix_gc_wait, !READ_ONCE(gc_in_progress)); + if (fpl->inflight) + unix_del_edges(fpl); + + kvfree(fpl->edges); + unix_free_vertices(fpl); } -/* The external entry point: unix_gc() */ -void unix_gc(void) +static bool unix_vertex_dead(struct unix_vertex *vertex) { - struct sk_buff *next_skb, *skb; + struct unix_edge *edge; struct unix_sock *u; - struct unix_sock *next; - struct sk_buff_head hitlist; - struct list_head cursor; - LIST_HEAD(not_cycle_list); + long total_ref; - spin_lock(&unix_gc_lock); + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + struct unix_vertex *next_vertex = unix_edge_successor(edge); - /* Avoid a recursive GC. */ - if (gc_in_progress) - goto out; + /* The vertex's fd can be received by a non-inflight socket. */ + if (!next_vertex) + return false; - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ - WRITE_ONCE(gc_in_progress, true); + /* The vertex's fd can be received by an inflight socket in + * another SCC. + */ + if (next_vertex->scc_index != vertex->scc_index) + return false; + } - /* First, select candidates for garbage collection. Only - * in-flight sockets are considered, and from those only ones - * which don't have any external reference. - * - * Holding unix_gc_lock will protect these candidates from - * being detached, and hence from gaining an external - * reference. Since there are no possible receivers, all - * buffers currently on the candidates' queues stay there - * during the garbage collection. - * - * We also know that no new candidate can be added onto the - * receive queues. Other, non candidate sockets _can_ be - * added to queue, so we must make sure only to touch - * candidates. - * - * Embryos, though never candidates themselves, affect which - * candidates are reachable by the garbage collector. Before - * being added to a listener's queue, an embryo may already - * receive data carrying SCM_RIGHTS, potentially making the - * passed socket a candidate that is not yet reachable by the - * collector. It becomes reachable once the embryo is - * enqueued. Therefore, we must ensure that no SCM-laden - * embryo appears in a (candidate) listener's queue between - * consecutive scan_children() calls. - */ - list_for_each_entry_safe(u, next, &gc_inflight_list, link) { - struct sock *sk = &u->sk; - long total_refs; - - total_refs = file_count(sk->sk_socket->file); - - BUG_ON(!u->inflight); - BUG_ON(total_refs < u->inflight); - if (total_refs == u->inflight) { - list_move_tail(&u->link, &gc_candidates); - __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - - if (sk->sk_state == TCP_LISTEN) { - unix_state_lock_nested(sk, U_LOCK_GC_LISTENER); - unix_state_unlock(sk); + /* No receiver exists out of the same SCC. */ + + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + total_ref = file_count(u->sk.sk_socket->file); + + /* If not close()d, total_ref > out_degree. */ + if (total_ref != vertex->out_degree) + return false; + + return true; +} + +static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) +{ + struct unix_vertex *vertex; + + list_for_each_entry_reverse(vertex, scc, scc_entry) { + struct sk_buff_head *queue; + struct unix_edge *edge; + struct unix_sock *u; + + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + queue = &u->sk.sk_receive_queue; + + spin_lock(&queue->lock); + + if (u->sk.sk_state == TCP_LISTEN) { + struct sk_buff *skb; + + skb_queue_walk(queue, skb) { + struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue; + + spin_lock(&embryo_queue->lock); + skb_queue_splice_init(embryo_queue, hitlist); + spin_unlock(&embryo_queue->lock); } + } else { + skb_queue_splice_init(queue, hitlist); } + + spin_unlock(&queue->lock); } +} - /* Now remove all internal in-flight reference to children of - * the candidates. - */ - list_for_each_entry(u, &gc_candidates, link) - scan_children(&u->sk, dec_inflight, NULL); +static bool unix_scc_cyclic(struct list_head *scc) +{ + struct unix_vertex *vertex; + struct unix_edge *edge; - /* Restore the references for children of all candidates, - * which have remaining references. Do this recursively, so - * only those remain, which form cyclic references. - * - * Use a "cursor" link, to make the list traversal safe, even - * though elements might be moved about. + /* SCC containing multiple vertices ? */ + if (!list_is_singular(scc)) + return true; + + vertex = list_first_entry(scc, typeof(*vertex), scc_entry); + + /* Self-reference or a embryo-listener circle ? */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + if (unix_edge_successor(edge) == vertex) + return true; + } + + return false; +} + +static LIST_HEAD(unix_visited_vertices); +static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; + +static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, + struct sk_buff_head *hitlist) +{ + LIST_HEAD(vertex_stack); + struct unix_edge *edge; + LIST_HEAD(edge_stack); + +next_vertex: + /* Push vertex to vertex_stack and mark it as on-stack + * (index >= UNIX_VERTEX_INDEX_START). + * The vertex will be popped when finalising SCC later. */ - list_add(&cursor, &gc_candidates); - while (cursor.next != &gc_candidates) { - u = list_entry(cursor.next, struct unix_sock, link); + list_add(&vertex->scc_entry, &vertex_stack); - /* Move cursor to after the current position. */ - list_move(&cursor, &u->link); + vertex->index = *last_index; + vertex->scc_index = *last_index; + (*last_index)++; - if (u->inflight) { - list_move_tail(&u->link, ¬_cycle_list); - __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - scan_children(&u->sk, inc_inflight_move_tail, NULL); + /* Explore neighbour vertices (receivers of the current vertex's fd). */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + struct unix_vertex *next_vertex = unix_edge_successor(edge); + + if (!next_vertex) + continue; + + if (next_vertex->index == unix_vertex_unvisited_index) { + /* Iterative deepening depth first search + * + * 1. Push a forward edge to edge_stack and set + * the successor to vertex for the next iteration. + */ + list_add(&edge->stack_entry, &edge_stack); + + vertex = next_vertex; + goto next_vertex; + + /* 2. Pop the edge directed to the current vertex + * and restore the ancestor for backtracking. + */ +prev_vertex: + edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); + list_del_init(&edge->stack_entry); + + next_vertex = vertex; + vertex = edge->predecessor->vertex; + + /* If the successor has a smaller scc_index, two vertices + * are in the same SCC, so propagate the smaller scc_index + * to skip SCC finalisation. + */ + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); + } else if (next_vertex->index != unix_vertex_grouped_index) { + /* Loop detected by a back/cross edge. + * + * The successor is on vertex_stack, so two vertices are in + * the same SCC. If the successor has a smaller *scc_index*, + * propagate it to skip SCC finalisation. + */ + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); + } else { + /* The successor was already grouped as another SCC */ } } - list_del(&cursor); - /* Now gc_candidates contains only garbage. Restore original - * inflight counters for these as well, and remove the skbuffs - * which are creating the cycle(s). - */ - skb_queue_head_init(&hitlist); - list_for_each_entry(u, &gc_candidates, link) { - scan_children(&u->sk, inc_inflight, &hitlist); - -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (u->oob_skb) { - kfree_skb(u->oob_skb); - u->oob_skb = NULL; + if (vertex->index == vertex->scc_index) { + struct unix_vertex *v; + struct list_head scc; + bool scc_dead = true; + + /* SCC finalised. + * + * If the scc_index was not updated, all the vertices above on + * vertex_stack are in the same SCC. Group them using scc_entry. + */ + __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); + + list_for_each_entry_reverse(v, &scc, scc_entry) { + /* Don't restart DFS from this vertex in unix_walk_scc(). */ + list_move_tail(&v->entry, &unix_visited_vertices); + + /* Mark vertex as off-stack. */ + v->index = unix_vertex_grouped_index; + + if (scc_dead) + scc_dead = unix_vertex_dead(v); } -#endif + + if (scc_dead) + unix_collect_skb(&scc, hitlist); + else if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + + list_del(&scc); } - /* not_cycle_list contains those sockets which do not make up a - * cycle. Restore these to the inflight list. + /* Need backtracking ? */ + if (!list_empty(&edge_stack)) + goto prev_vertex; +} + +static void unix_walk_scc(struct sk_buff_head *hitlist) +{ + unsigned long last_index = UNIX_VERTEX_INDEX_START; + + unix_graph_maybe_cyclic = false; + + /* Visit every vertex exactly once. + * __unix_walk_scc() moves visited vertices to unix_visited_vertices. */ - while (!list_empty(¬_cycle_list)) { - u = list_entry(not_cycle_list.next, struct unix_sock, link); - __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - list_move_tail(&u->link, &gc_inflight_list); + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + __unix_walk_scc(vertex, &last_index, hitlist); } - spin_unlock(&unix_gc_lock); + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); + swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); - /* We need io_uring to clean its registered files, ignore all io_uring - * originated skbs. It's fine as io_uring doesn't keep references to - * other io_uring instances and so killing all other files in the cycle - * will put all io_uring references forcing it to go through normal - * release.path eventually putting registered files. - */ - skb_queue_walk_safe(&hitlist, skb, next_skb) { - if (skb->scm_io_uring) { - __skb_unlink(skb, &hitlist); - skb_queue_tail(&skb->sk->sk_receive_queue, skb); + unix_graph_grouped = true; +} + +static void unix_walk_scc_fast(struct sk_buff_head *hitlist) +{ + unix_graph_maybe_cyclic = false; + + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + struct list_head scc; + bool scc_dead = true; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + list_add(&scc, &vertex->scc_entry); + + list_for_each_entry_reverse(vertex, &scc, scc_entry) { + list_move_tail(&vertex->entry, &unix_visited_vertices); + + if (scc_dead) + scc_dead = unix_vertex_dead(vertex); } + + if (scc_dead) + unix_collect_skb(&scc, hitlist); + else if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + + list_del(&scc); } - /* Here we are. Hitlist is filled. Die. */ - __skb_queue_purge(&hitlist); + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); +} + +static bool gc_in_progress; + +static void __unix_gc(struct work_struct *work) +{ + struct sk_buff_head hitlist; + struct sk_buff *skb; spin_lock(&unix_gc_lock); - /* There could be io_uring registered files, just push them back to - * the inflight list - */ - list_for_each_entry_safe(u, next, &gc_candidates, link) - list_move_tail(&u->link, &gc_inflight_list); + if (!unix_graph_maybe_cyclic) { + spin_unlock(&unix_gc_lock); + goto skip_gc; + } + + __skb_queue_head_init(&hitlist); + + if (unix_graph_grouped) + unix_walk_scc_fast(&hitlist); + else + unix_walk_scc(&hitlist); + + spin_unlock(&unix_gc_lock); - /* All candidates should have been detached by now. */ - BUG_ON(!list_empty(&gc_candidates)); + skb_queue_walk(&hitlist, skb) { + if (UNIXCB(skb).fp) + UNIXCB(skb).fp->dead = true; + } - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ + __skb_queue_purge(&hitlist); +skip_gc: WRITE_ONCE(gc_in_progress, false); +} - wake_up(&unix_gc_wait); +static DECLARE_WORK(unix_gc_work, __unix_gc); - out: - spin_unlock(&unix_gc_lock); +void unix_gc(void) +{ + WRITE_ONCE(gc_in_progress, true); + queue_work(system_unbound_wq, &unix_gc_work); +} + +#define UNIX_INFLIGHT_TRIGGER_GC 16000 +#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) + +void wait_for_unix_gc(struct scm_fp_list *fpl) +{ + /* If number of inflight sockets is insane, + * force a garbage collect right now. + * + * Paired with the WRITE_ONCE() in unix_inflight(), + * unix_notinflight(), and __unix_gc(). + */ + if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && + !READ_ONCE(gc_in_progress)) + unix_gc(); + + /* Penalise users who want to send AF_UNIX sockets + * but whose sockets have not been received yet. + */ + if (!fpl || !fpl->count_unix || + READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) + return; + + if (READ_ONCE(gc_in_progress)) + flush_work(&unix_gc_work); } diff --git a/net/unix/scm.c b/net/unix/scm.c deleted file mode 100644 index 4eff7da9f6f9..000000000000 --- a/net/unix/scm.c +++ /dev/null @@ -1,154 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/net.h> -#include <linux/fs.h> -#include <net/af_unix.h> -#include <net/scm.h> -#include <linux/init.h> -#include <linux/io_uring.h> - -#include "scm.h" - -unsigned int unix_tot_inflight; -EXPORT_SYMBOL(unix_tot_inflight); - -LIST_HEAD(gc_inflight_list); -EXPORT_SYMBOL(gc_inflight_list); - -DEFINE_SPINLOCK(unix_gc_lock); -EXPORT_SYMBOL(unix_gc_lock); - -struct sock *unix_get_socket(struct file *filp) -{ - struct sock *u_sock = NULL; - struct inode *inode = file_inode(filp); - - /* Socket ? */ - if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { - struct socket *sock = SOCKET_I(inode); - struct sock *s = sock->sk; - - /* PF_UNIX ? */ - if (s && sock->ops && sock->ops->family == PF_UNIX) - u_sock = s; - } - - return u_sock; -} -EXPORT_SYMBOL(unix_get_socket); - -/* Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. - */ -void unix_inflight(struct user_struct *user, struct file *fp) -{ - struct sock *s = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (s) { - struct unix_sock *u = unix_sk(s); - - if (!u->inflight) { - BUG_ON(!list_empty(&u->link)); - list_add_tail(&u->link, &gc_inflight_list); - } else { - BUG_ON(list_empty(&u->link)); - } - u->inflight++; - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); - } - WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); - spin_unlock(&unix_gc_lock); -} - -void unix_notinflight(struct user_struct *user, struct file *fp) -{ - struct sock *s = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (s) { - struct unix_sock *u = unix_sk(s); - - BUG_ON(!u->inflight); - BUG_ON(list_empty(&u->link)); - - u->inflight--; - if (!u->inflight) - list_del_init(&u->link); - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); - } - WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); - spin_unlock(&unix_gc_lock); -} - -/* - * The "user->unix_inflight" variable is protected by the garbage - * collection lock, and we just read it locklessly here. If you go - * over the limit, there might be a tiny race in actually noticing - * it across threads. Tough. - */ -static inline bool too_many_unix_fds(struct task_struct *p) -{ - struct user_struct *user = current_user(); - - if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) - return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); - return false; -} - -int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - - if (too_many_unix_fds(current)) - return -ETOOMANYREFS; - - /* - * Need to duplicate file references for the sake of garbage - * collection. Otherwise a socket in the fps might become a - * candidate for GC while the skb is not yet queued. - */ - UNIXCB(skb).fp = scm_fp_dup(scm->fp); - if (!UNIXCB(skb).fp) - return -ENOMEM; - - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->user, scm->fp->fp[i]); - return 0; -} -EXPORT_SYMBOL(unix_attach_fds); - -void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - - scm->fp = UNIXCB(skb).fp; - UNIXCB(skb).fp = NULL; - - for (i = scm->fp->count-1; i >= 0; i--) - unix_notinflight(scm->fp->user, scm->fp->fp[i]); -} -EXPORT_SYMBOL(unix_detach_fds); - -void unix_destruct_scm(struct sk_buff *skb) -{ - struct scm_cookie scm; - - memset(&scm, 0, sizeof(scm)); - scm.pid = UNIXCB(skb).pid; - if (UNIXCB(skb).fp) - unix_detach_fds(&scm, skb); - - /* Alas, it calls VFS */ - /* So fscking what? fput() had been SMP-safe since the last Summer */ - scm_destroy(&scm); - sock_wfree(skb); -} -EXPORT_SYMBOL(unix_destruct_scm); diff --git a/net/unix/scm.h b/net/unix/scm.h deleted file mode 100644 index 5a255a477f16..000000000000 --- a/net/unix/scm.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef NET_UNIX_SCM_H -#define NET_UNIX_SCM_H - -extern struct list_head gc_inflight_list; -extern spinlock_t unix_gc_lock; - -int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb); -void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb); - -#endif |