summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--fs/eventpoll.c168
-rw-r--r--fs/file_table.c1
-rw-r--r--include/linux/eventpoll.h11
-rw-r--r--include/linux/fs.h5
4 files changed, 129 insertions, 56 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8f68095c5ce9..d89aead356df 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -215,6 +215,7 @@ struct eventpoll {
 
 	/* used to optimize loop detection check */
 	u64 gen;
+	struct hlist_head refs;
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	/* used to track busy poll napi_id */
@@ -259,7 +260,45 @@ static struct kmem_cache *pwq_cache __read_mostly;
  * List of files with newly added links, where we may need to limit the number
  * of emanating paths. Protected by the epmutex.
  */
-static LIST_HEAD(tfile_check_list);
+struct epitems_head {
+	struct hlist_head epitems;
+	struct epitems_head *next;
+};
+static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
+
+static struct kmem_cache *ephead_cache __read_mostly;
+
+static inline void free_ephead(struct epitems_head *head)
+{
+	if (head)
+		kmem_cache_free(ephead_cache, head);
+}
+
+static void list_file(struct file *file)
+{
+	struct epitems_head *head;
+
+	head = container_of(file->f_ep, struct epitems_head, epitems);
+	if (!head->next) {
+		head->next = tfile_check_list;
+		tfile_check_list = head;
+	}
+}
+
+static void unlist_file(struct epitems_head *head)
+{
+	struct epitems_head *to_free = head;
+	struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
+	if (p) {
+		struct epitem *epi= container_of(p, struct epitem, fllink);
+		spin_lock(&epi->ffd.file->f_lock);
+		if (!hlist_empty(&head->epitems))
+			to_free = NULL;
+		head->next = NULL;
+		spin_unlock(&epi->ffd.file->f_lock);
+	}
+	free_ephead(to_free);
+}
 
 #ifdef CONFIG_SYSCTL
 
@@ -632,6 +671,8 @@ static void epi_rcu_free(struct rcu_head *head)
 static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 {
 	struct file *file = epi->ffd.file;
+	struct epitems_head *to_free;
+	struct hlist_head *head;
 
 	lockdep_assert_irqs_enabled();
 
@@ -642,8 +683,20 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 
 	/* Remove the current item from the list of epoll hooks */
 	spin_lock(&file->f_lock);
+	to_free = NULL;
+	head = file->f_ep;
+	if (head->first == &epi->fllink && !epi->fllink.next) {
+		file->f_ep = NULL;
+		if (!is_file_epoll(file)) {
+			struct epitems_head *v;
+			v = container_of(head, struct epitems_head, epitems);
+			if (!smp_load_acquire(&v->next))
+				to_free = v;
+		}
+	}
 	hlist_del_rcu(&epi->fllink);
 	spin_unlock(&file->f_lock);
+	free_ephead(to_free);
 
 	rb_erase_cached(&epi->rbn, &ep->rbr);
 
@@ -852,7 +905,11 @@ void eventpoll_release_file(struct file *file)
 	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
 	 */
 	mutex_lock(&epmutex);
-	hlist_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
+	if (unlikely(!file->f_ep)) {
+		mutex_unlock(&epmutex);
+		return;
+	}
+	hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
 		ep = epi->ep;
 		mutex_lock_nested(&ep->mtx, 0);
 		ep_remove(ep, epi);
@@ -1248,7 +1305,7 @@ static void path_count_init(void)
 		path_count[i] = 0;
 }
 
-static int reverse_path_check_proc(struct file *file, int depth)
+static int reverse_path_check_proc(struct hlist_head *refs, int depth)
 {
 	int error = 0;
 	struct epitem *epi;
@@ -1257,14 +1314,12 @@ static int reverse_path_check_proc(struct file *file, int depth)
 		return -1;
 
 	/* CTL_DEL can remove links here, but that can't increase our count */
-	hlist_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
-		struct file *recepient = epi->ep->file;
-		if (WARN_ON(!is_file_epoll(recepient)))
-			continue;
-		if (hlist_empty(&recepient->f_ep_links))
+	hlist_for_each_entry_rcu(epi, refs, fllink) {
+		struct hlist_head *refs = &epi->ep->refs;
+		if (hlist_empty(refs))
 			error = path_count_inc(depth);
 		else
-			error = reverse_path_check_proc(recepient, depth + 1);
+			error = reverse_path_check_proc(refs, depth + 1);
 		if (error != 0)
 			break;
 	}
@@ -1272,7 +1327,7 @@ static int reverse_path_check_proc(struct file *file, int depth)
 }
 
 /**
- * reverse_path_check - The tfile_check_list is list of file *, which have
+ * reverse_path_check - The tfile_check_list is list of epitem_head, which have
  *                      links that are proposed to be newly added. We need to
  *                      make sure that those added links don't add too many
  *                      paths such that we will spend all our time waking up
@@ -1283,19 +1338,18 @@ static int reverse_path_check_proc(struct file *file, int depth)
  */
 static int reverse_path_check(void)
 {
-	int error = 0;
-	struct file *current_file;
+	struct epitems_head *p;
 
-	/* let's call this for all tfiles */
-	list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
+	for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
+		int error;
 		path_count_init();
 		rcu_read_lock();
-		error = reverse_path_check_proc(current_file, 0);
+		error = reverse_path_check_proc(&p->epitems, 0);
 		rcu_read_unlock();
 		if (error)
-			break;
+			return error;
 	}
-	return error;
+	return 0;
 }
 
 static int ep_create_wakeup_source(struct epitem *epi)
@@ -1336,6 +1390,39 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
 	wakeup_source_unregister(ws);
 }
 
+static int attach_epitem(struct file *file, struct epitem *epi)
+{
+	struct epitems_head *to_free = NULL;
+	struct hlist_head *head = NULL;
+	struct eventpoll *ep = NULL;
+
+	if (is_file_epoll(file))
+		ep = file->private_data;
+
+	if (ep) {
+		head = &ep->refs;
+	} else if (!READ_ONCE(file->f_ep)) {
+allocate:
+		to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
+		if (!to_free)
+			return -ENOMEM;
+		head = &to_free->epitems;
+	}
+	spin_lock(&file->f_lock);
+	if (!file->f_ep) {
+		if (unlikely(!head)) {
+			spin_unlock(&file->f_lock);
+			goto allocate;
+		}
+		file->f_ep = head;
+		to_free = NULL;
+	}
+	hlist_add_head_rcu(&epi->fllink, file->f_ep);
+	spin_unlock(&file->f_lock);
+	free_ephead(to_free);
+	return 0;
+}
+
 /*
  * Must be called with "mtx" held.
  */
@@ -1367,19 +1454,21 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 	epi->event = *event;
 	epi->next = EP_UNACTIVE_PTR;
 
-	atomic_long_inc(&ep->user->epoll_watches);
-
 	if (tep)
 		mutex_lock_nested(&tep->mtx, 1);
 	/* Add the current item to the list of active epoll hook for this file */
-	spin_lock(&tfile->f_lock);
-	hlist_add_head_rcu(&epi->fllink, &tfile->f_ep_links);
-	spin_unlock(&tfile->f_lock);
-	if (full_check && !tep) {
-		get_file(tfile);
-		list_add(&tfile->f_tfile_llink, &tfile_check_list);
+	if (unlikely(attach_epitem(tfile, epi) < 0)) {
+		kmem_cache_free(epi_cache, epi);
+		if (tep)
+			mutex_unlock(&tep->mtx);
+		return -ENOMEM;
 	}
 
+	if (full_check && !tep)
+		list_file(tfile);
+
+	atomic_long_inc(&ep->user->epoll_watches);
+
 	/*
 	 * Add the current item to the RB tree. All RB tree operations are
 	 * protected by "mtx", and ep_insert() is called with "mtx" held.
@@ -1813,11 +1902,7 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
 			 * not already there, and calling reverse_path_check()
 			 * during ep_insert().
 			 */
-			if (list_empty(&epi->ffd.file->f_tfile_llink)) {
-				if (get_file_rcu(epi->ffd.file))
-					list_add(&epi->ffd.file->f_tfile_llink,
-						 &tfile_check_list);
-			}
+			list_file(epi->ffd.file);
 		}
 	}
 	mutex_unlock(&ep->mtx);
@@ -1844,16 +1929,13 @@ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
 
 static void clear_tfile_check_list(void)
 {
-	struct file *file;
-
-	/* first clear the tfile_check_list */
-	while (!list_empty(&tfile_check_list)) {
-		file = list_first_entry(&tfile_check_list, struct file,
-					f_tfile_llink);
-		list_del_init(&file->f_tfile_llink);
-		fput(file);
+	rcu_read_lock();
+	while (tfile_check_list != EP_UNACTIVE_PTR) {
+		struct epitems_head *head = tfile_check_list;
+		tfile_check_list = head->next;
+		unlist_file(head);
 	}
-	INIT_LIST_HEAD(&tfile_check_list);
+	rcu_read_unlock();
 }
 
 /*
@@ -2003,9 +2085,8 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 	if (error)
 		goto error_tgt_fput;
 	if (op == EPOLL_CTL_ADD) {
-		if (!hlist_empty(&f.file->f_ep_links) ||
-				ep->gen == loop_check_gen ||
-						is_file_epoll(tf.file)) {
+		if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
+		    is_file_epoll(tf.file)) {
 			mutex_unlock(&ep->mtx);
 			error = epoll_mutex_lock(&epmutex, 0, nonblock);
 			if (error)
@@ -2216,6 +2297,9 @@ static int __init eventpoll_init(void)
 	pwq_cache = kmem_cache_create("eventpoll_pwq",
 		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
 
+	ephead_cache = kmem_cache_create("ep_head",
+		sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
+
 	return 0;
 }
 fs_initcall(eventpoll_init);
diff --git a/fs/file_table.c b/fs/file_table.c
index 709ada3151da..45437f8e1003 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -113,7 +113,6 @@ static struct file *__alloc_file(int flags, const struct cred *cred)
 	rwlock_init(&f->f_owner.lock);
 	spin_lock_init(&f->f_lock);
 	mutex_init(&f->f_pos_lock);
-	eventpoll_init_file(f);
 	f->f_flags = flags;
 	f->f_mode = OPEN_FMODE(flags);
 	/* f->f_version: 0 */
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 4e215ccfa792..0350393465d4 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -22,14 +22,6 @@ struct file;
 struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
 #endif
 
-/* Used to initialize the epoll bits inside the "struct file" */
-static inline void eventpoll_init_file(struct file *file)
-{
-	INIT_HLIST_HEAD(&file->f_ep_links);
-	INIT_LIST_HEAD(&file->f_tfile_llink);
-}
-
-
 /* Used to release the epoll bits inside the "struct file" */
 void eventpoll_release_file(struct file *file);
 
@@ -50,7 +42,7 @@ static inline void eventpoll_release(struct file *file)
 	 * because the file in on the way to be removed and nobody ( but
 	 * eventpoll ) has still a reference to this file.
 	 */
-	if (likely(hlist_empty(&file->f_ep_links)))
+	if (likely(!file->f_ep))
 		return;
 
 	/*
@@ -72,7 +64,6 @@ static inline int ep_op_has_event(int op)
 
 #else
 
-static inline void eventpoll_init_file(struct file *file) {}
 static inline void eventpoll_release(struct file *file) {}
 
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b484ba0e474a..993c540e3b77 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -923,7 +923,7 @@ struct file {
 	const struct file_operations	*f_op;
 
 	/*
-	 * Protects f_ep_links, f_flags.
+	 * Protects f_ep, f_flags.
 	 * Must not be taken from IRQ context.
 	 */
 	spinlock_t		f_lock;
@@ -946,8 +946,7 @@ struct file {
 
 #ifdef CONFIG_EPOLL
 	/* Used by fs/eventpoll.c to link all the hooks to this file */
-	struct hlist_head	f_ep_links;
-	struct list_head	f_tfile_llink;
+	struct hlist_head	*f_ep;
 #endif /* #ifdef CONFIG_EPOLL */
 	struct address_space	*f_mapping;
 	errseq_t		f_wb_err;