summary refs log tree commit diff
path: root/kernel/fork.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-12-17 15:44:47 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-17 15:44:47 -0800
commit6a2b60b17b3e48a418695a94bd2420f6ab32e519 (patch)
tree54b7792fa68b8890f710fa6398b6ba8626a039a8 /kernel/fork.c
parent9228ff90387e276ad67b10c0eb525c9d6a57d5e9 (diff)
parent98f842e675f96ffac96e6c50315790912b2812be (diff)
downloadlinux-6a2b60b17b3e48a418695a94bd2420f6ab32e519.tar.gz
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull user namespace changes from Eric Biederman:
 "While small this set of changes is very significant with respect to
  containers in general and user namespaces in particular.  The user
  space interface is now complete.

  This set of changes adds support for unprivileged users to create user
  namespaces and as a user namespace root to create other namespaces.
  The tyranny of supporting suid root preventing unprivileged users from
  using cool new kernel features is broken.

  This set of changes completes the work on setns, adding support for
  the pid, user, mount namespaces.

  This set of changes includes a bunch of basic pid namespace
  cleanups/simplifications.  Of particular significance is the rework of
  the pid namespace cleanup so it no longer requires sending out
  tendrils into all kinds of unexpected cleanup paths for operation.  At
  least one case of broken error handling is fixed by this cleanup.

  The files under /proc/<pid>/ns/ have been converted from regular files
  to magic symlinks which prevents incorrect caching by the VFS,
  ensuring the files always refer to the namespace the process is
  currently using and ensuring that the ptrace_mayaccess permission
  checks are always applied.

  The files under /proc/<pid>/ns/ have been given stable inode numbers
  so it is now possible to see if different processes share the same
  namespaces.

  Through the David Miller's net tree are changes to relax many of the
  permission checks in the networking stack to allowing the user
  namespace root to usefully use the networking stack.  Similar changes
  for the mount namespace and the pid namespace are coming through my
  tree.

  Two small changes to add user namespace support were commited here adn
  in David Miller's -net tree so that I could complete the work on the
  /proc/<pid>/ns/ files in this tree.

  Work remains to make it safe to build user namespaces and 9p, afs,
  ceph, cifs, coda, gfs2, ncpfs, nfs, nfsd, ocfs2, and xfs so the
  Kconfig guard remains in place preventing that user namespaces from
  being built when any of those filesystems are enabled.

  Future design work remains to allow root users outside of the initial
  user namespace to mount more than just /proc and /sys."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (38 commits)
  proc: Usable inode numbers for the namespace file descriptors.
  proc: Fix the namespace inode permission checks.
  proc: Generalize proc inode allocation
  userns: Allow unprivilged mounts of proc and sysfs
  userns: For /proc/self/{uid,gid}_map derive the lower userns from the struct file
  procfs: Print task uids and gids in the userns that opened the proc file
  userns: Implement unshare of the user namespace
  userns: Implent proc namespace operations
  userns: Kill task_user_ns
  userns: Make create_new_namespaces take a user_ns parameter
  userns: Allow unprivileged use of setns.
  userns: Allow unprivileged users to create new namespaces
  userns: Allow setting a userns mapping to your current uid.
  userns: Allow chown and setgid preservation
  userns: Allow unprivileged users to create user namespaces.
  userns: Ignore suid and sgid on binaries if the uid or gid can not be mapped
  userns: fix return value on mntns_install() failure
  vfs: Allow unprivileged manipulation of the mount namespace.
  vfs: Only support slave subtrees across different user namespaces
  vfs: Add a user namespace reference from struct mnt_namespace
  ...
Diffstat (limited to 'kernel/fork.c')
-rw-r--r--kernel/fork.c69
1 files changed, 48 insertions, 21 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 115d6c2e4cca..c36c4e301efe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1044,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	atomic_set(&sig->live, 1);
 	atomic_set(&sig->sigcnt, 1);
 	init_waitqueue_head(&sig->wait_chldexit);
-	if (clone_flags & CLONE_NEWPID)
-		sig->flags |= SIGNAL_UNKILLABLE;
 	sig->curr_target = tsk;
 	init_sigpending(&sig->shared_pending);
 	INIT_LIST_HEAD(&sig->posix_timers);
@@ -1438,8 +1436,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
 
 		if (thread_group_leader(p)) {
-			if (is_child_reaper(pid))
-				p->nsproxy->pid_ns->child_reaper = p;
+			if (is_child_reaper(pid)) {
+				ns_of_pid(pid)->child_reaper = p;
+				p->signal->flags |= SIGNAL_UNKILLABLE;
+			}
 
 			p->signal->leader_pid = pid;
 			p->signal->tty = tty_kref_get(current->signal->tty);
@@ -1473,8 +1473,6 @@ bad_fork_cleanup_io:
 	if (p->io_context)
 		exit_io_context(p);
 bad_fork_cleanup_namespaces:
-	if (unlikely(clone_flags & CLONE_NEWPID))
-		pid_ns_release_proc(p->nsproxy->pid_ns);
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
 	if (p->mm)
@@ -1554,15 +1552,9 @@ long do_fork(unsigned long clone_flags,
 	 * Do some preliminary argument and permissions checking before we
 	 * actually start allocating stuff
 	 */
-	if (clone_flags & CLONE_NEWUSER) {
-		if (clone_flags & CLONE_THREAD)
+	if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
+		if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
 			return -EINVAL;
-		/* hopefully this check will go away when userns support is
-		 * complete
-		 */
-		if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
-				!capable(CAP_SETGID))
-			return -EPERM;
 	}
 
 	/*
@@ -1724,7 +1716,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
 {
 	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
 				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
+				CLONE_NEWUSER|CLONE_NEWPID))
 		return -EINVAL;
 	/*
 	 * Not implemented, but pretend it works if there is nothing to
@@ -1791,19 +1784,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 {
 	struct fs_struct *fs, *new_fs = NULL;
 	struct files_struct *fd, *new_fd = NULL;
+	struct cred *new_cred = NULL;
 	struct nsproxy *new_nsproxy = NULL;
 	int do_sysvsem = 0;
 	int err;
 
-	err = check_unshare_flags(unshare_flags);
-	if (err)
-		goto bad_unshare_out;
-
+	/*
+	 * If unsharing a user namespace must also unshare the thread.
+	 */
+	if (unshare_flags & CLONE_NEWUSER)
+		unshare_flags |= CLONE_THREAD;
+	/*
+	 * If unsharing a pid namespace must also unshare the thread.
+	 */
+	if (unshare_flags & CLONE_NEWPID)
+		unshare_flags |= CLONE_THREAD;
+	/*
+	 * If unsharing a thread from a thread group, must also unshare vm.
+	 */
+	if (unshare_flags & CLONE_THREAD)
+		unshare_flags |= CLONE_VM;
+	/*
+	 * If unsharing vm, must also unshare signal handlers.
+	 */
+	if (unshare_flags & CLONE_VM)
+		unshare_flags |= CLONE_SIGHAND;
 	/*
 	 * If unsharing namespace, must also unshare filesystem information.
 	 */
 	if (unshare_flags & CLONE_NEWNS)
 		unshare_flags |= CLONE_FS;
+
+	err = check_unshare_flags(unshare_flags);
+	if (err)
+		goto bad_unshare_out;
 	/*
 	 * CLONE_NEWIPC must also detach from the undolist: after switching
 	 * to a new ipc namespace, the semaphore arrays from the old
@@ -1817,11 +1831,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 	err = unshare_fd(unshare_flags, &new_fd);
 	if (err)
 		goto bad_unshare_cleanup_fs;
-	err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
+	err = unshare_userns(unshare_flags, &new_cred);
 	if (err)
 		goto bad_unshare_cleanup_fd;
+	err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+					 new_cred, new_fs);
+	if (err)
+		goto bad_unshare_cleanup_cred;
 
-	if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
+	if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
 		if (do_sysvsem) {
 			/*
 			 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1854,11 +1872,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 		}
 
 		task_unlock(current);
+
+		if (new_cred) {
+			/* Install the new user namespace */
+			commit_creds(new_cred);
+			new_cred = NULL;
+		}
 	}
 
 	if (new_nsproxy)
 		put_nsproxy(new_nsproxy);
 
+bad_unshare_cleanup_cred:
+	if (new_cred)
+		put_cred(new_cred);
 bad_unshare_cleanup_fd:
 	if (new_fd)
 		put_files_struct(new_fd);