summary refs log tree commit diff
path: root/kernel
diff options
context:
space:
mode:
author Nicholas Piggin <npiggin@gmail.com> 2021-09-07 20:00:00 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2021-09-08 11:50:27 -0700
commit 1e1c15839df084f4011825fee922aa976c9159dc (patch)
tree eba0d3e4771362a9032bb3637607cd41f52b2ae6 /kernel
parent 4ce9f970457899defdf68e26e0502c7245002eb3 (diff)
download linux-1e1c15839df084f4011825fee922aa976c9159dc.tar.gz
fs/epoll: use a per-cpu counter for user's watches count
This counter tracks the number of watches a user has, to compare against
the 'max_user_watches' limit. This causes a scalability bottleneck on
SPECjbb2015 on large systems as there is only one user. Changing to a
per-cpu counter increases throughput of the benchmark by about 30% on a
16-socket, > 1000 thread system.

[rdunlap@infradead.org: fix build errors in kernel/user.c when CONFIG_EPOLL=n]
[npiggin@gmail.com: move ifdefs into wrapper functions, slightly improve panic message]
  Link: https://lkml.kernel.org/r/1628051945.fens3r99ox.astroid@bobo.none
[akpm@linux-foundation.org: tweak user_epoll_alloc(), per Guenter]
  Link: https://lkml.kernel.org/r/20210804191421.GA1900577@roeck-us.net

Link: https://lkml.kernel.org/r/20210802032013.2751916-1-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reported-by: Anton Blanchard <anton@ozlabs.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/user.c 25
1 files changed, 25 insertions, 0 deletions
diff --git a/kernel/user.c b/kernel/user.c
index c82399c1618a..e2cf8c22b539 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -129,6 +129,22 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
 	return NULL;
 }
 
+static int user_epoll_alloc(struct user_struct *up)
+{
+#ifdef CONFIG_EPOLL
+	return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL);
+#else
+	return 0;
+#endif
+}
+
+static void user_epoll_free(struct user_struct *up)
+{
+#ifdef CONFIG_EPOLL
+	percpu_counter_destroy(&up->epoll_watches);
+#endif
+}
+
 /* IRQs are disabled and uidhash_lock is held upon function entry.
  * IRQ state (as stored in flags) is restored and uidhash_lock released
  * upon function exit.
@@ -138,6 +154,7 @@ static void free_user(struct user_struct *up, unsigned long flags)
 {
 	uid_hash_remove(up);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
+	user_epoll_free(up);
 	kmem_cache_free(uid_cachep, up);
 }
 
@@ -185,6 +202,10 @@ struct user_struct *alloc_uid(kuid_t uid)
 
 		new->uid = uid;
 		refcount_set(&new->__count, 1);
+		if (user_epoll_alloc(new)) {
+			kmem_cache_free(uid_cachep, new);
+			return NULL;
+		}
 		ratelimit_state_init(&new->ratelimit, HZ, 100);
 		ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE);
 
@@ -195,6 +216,7 @@ struct user_struct *alloc_uid(kuid_t uid)
 		spin_lock_irq(&uidhash_lock);
 		up = uid_hash_find(uid, hashent);
 		if (up) {
+			user_epoll_free(new);
 			kmem_cache_free(uid_cachep, new);
 		} else {
 			uid_hash_insert(new, hashent);
@@ -216,6 +238,9 @@ static int __init uid_cache_init(void)
 	for(n = 0; n < UIDHASH_SZ; ++n)
 		INIT_HLIST_HEAD(uidhash_table + n);
 
+	if (user_epoll_alloc(&root_user))
+		panic("root_user epoll percpu counter alloc failed");
+
 	/* Insert the root user immediately (init already runs as root) */
 	spin_lock_irq(&uidhash_lock);
 	uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));