From 74d46d6b2d23d44d72c37df4c6a5d2e782f7b088 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Jul 2009 17:11:50 +0900 Subject: percpu, sparc64: fix sparse possible cpu map handling percpu code has been assuming num_possible_cpus() == nr_cpu_ids which is incorrect if cpu_possible_map contains holes. This causes percpu code to access beyond allocated memories and vmalloc areas. On a sparc64 machine with cpus 0 and 2 (u60), this triggers the following warning or fails boot. WARNING: at /devel/tj/os/work/mm/vmalloc.c:106 vmap_page_range_noflush+0x1f0/0x240() Modules linked in: Call Trace: [00000000004b17d0] vmap_page_range_noflush+0x1f0/0x240 [00000000004b1840] map_vm_area+0x20/0x60 [00000000004b1950] __vmalloc_area_node+0xd0/0x160 [0000000000593434] deflate_init+0x14/0xe0 [0000000000583b94] __crypto_alloc_tfm+0xd4/0x1e0 [00000000005844f0] crypto_alloc_base+0x50/0xa0 [000000000058b898] alg_test_comp+0x18/0x80 [000000000058dad4] alg_test+0x54/0x180 [000000000058af00] cryptomgr_test+0x40/0x60 [0000000000473098] kthread+0x58/0x80 [000000000042b590] kernel_thread+0x30/0x60 [0000000000472fd0] kthreadd+0xf0/0x160 ---[ end trace 429b268a213317ba ]--- This patch fixes generic percpu functions and sparc64 setup_per_cpu_areas() so that they handle sparse cpu_possible_map properly. Please note that on x86, cpu_possible_map() doesn't contain holes and thus num_possible_cpus() == nr_cpu_ids and this patch doesn't cause any behavior difference. Signed-off-by: Tejun Heo Acked-by: David S. Miller Cc: Ingo Molnar --- mm/percpu.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index b70f2acd8853..e0be1146f617 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -8,12 +8,12 @@ * * This is percpu allocator which can handle both static and dynamic * areas. Percpu areas are allocated in chunks in vmalloc area. Each - * chunk is consisted of num_possible_cpus() units and the first chunk - * is used for static percpu variables in the kernel image (special - * boot time alloc/init handling necessary as these areas need to be - * brought up before allocation services are running). Unit grows as - * necessary and all units grow or shrink in unison. When a chunk is - * filled up, another chunk is allocated. ie. in vmalloc area + * chunk is consisted of nr_cpu_ids units and the first chunk is used + * for static percpu variables in the kernel image (special boot time + * alloc/init handling necessary as these areas need to be brought up + * before allocation services are running). Unit grows as necessary + * and all units grow or shrink in unison. When a chunk is filled up, + * another chunk is allocated. ie. in vmalloc area * * c0 c1 c2 * ------------------- ------------------- ------------ @@ -558,7 +558,7 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, bool flush_tlb) { - unsigned int last = num_possible_cpus() - 1; + unsigned int last = nr_cpu_ids - 1; unsigned int cpu; /* unmap must not be done on immutable chunk */ @@ -643,7 +643,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, */ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; + unsigned int last = nr_cpu_ids - 1; unsigned int cpu; int err; @@ -1067,9 +1067,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, PFN_UP(size_sum)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *); if (dyn_size < 0) dyn_size = pcpu_unit_size - static_size - reserved_size; @@ -1248,7 +1248,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } else pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - chunk_size = pcpue_unit_size * num_possible_cpus(); + chunk_size = pcpue_unit_size * nr_cpu_ids; pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); @@ -1259,12 +1259,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } /* return the leftover and copy */ - for_each_possible_cpu(cpu) { + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *ptr = pcpue_ptr + cpu * pcpue_unit_size; - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); - memcpy(ptr, __per_cpu_load, static_size); + if (cpu_possible(cpu)) { + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, static_size); + } else + free_bootmem(__pa(ptr), pcpue_unit_size); } /* we're ready, commit */ -- cgit 1.4.1 From 142d44b0dd6741a64a7bdbe029110e7c1dcf1d23 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Thu, 13 Aug 2009 02:00:13 -0400 Subject: percpu: use the right flag for get_vm_area() get_vm_area() only accepts VM_* flags, not GFP_*. And according to the doc of get_vm_area(), here should be VM_ALLOC. Signed-off-by: WANG Cong Acked-by: Tejun Heo Cc: Ingo Molnar --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index e0be1146f617..5fe37842e0ea 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -749,7 +749,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) chunk->map[chunk->map_used++] = pcpu_unit_size; chunk->page = chunk->page_ar; - chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); + chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); if (!chunk->vm) { free_pcpu_chunk(chunk); return NULL; -- cgit 1.4.1 From 788084aba2ab7348257597496befcbccabdc98a3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 31 Jul 2009 12:54:11 -0400 Subject: Security/SELinux: seperate lsm specific mmap_min_addr Currently SELinux enforcement of controls on the ability to map low memory is determined by the mmap_min_addr tunable. This patch causes SELinux to ignore the tunable and instead use a seperate Kconfig option specific to how much space the LSM should protect. The tunable will now only control the need for CAP_SYS_RAWIO and SELinux permissions will always protect the amount of low memory designated by CONFIG_LSM_MMAP_MIN_ADDR. This allows users who need to disable the mmap_min_addr controls (usual reason being they run WINE as a non-root user) to do so and still have SELinux controls preventing confined domains (like a web server) from being able to map some area of low memory. Signed-off-by: Eric Paris Signed-off-by: James Morris --- include/linux/mm.h | 15 --------------- include/linux/security.h | 17 +++++++++++++++++ kernel/sysctl.c | 7 ++++--- mm/Kconfig | 6 +++--- mm/mmap.c | 3 --- mm/nommu.c | 3 --- security/Kconfig | 16 ++++++++++++++++ security/Makefile | 2 +- security/commoncap.c | 2 +- security/min_addr.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ security/selinux/hooks.c | 2 +- 11 files changed, 92 insertions(+), 30 deletions(-) create mode 100644 security/min_addr.c (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index ba3a7cb1eaa0..9a72cc78e6b8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -34,8 +34,6 @@ extern int sysctl_legacy_va_layout; #define sysctl_legacy_va_layout 0 #endif -extern unsigned long mmap_min_addr; - #include #include #include @@ -574,19 +572,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone, set_page_section(page, pfn_to_section_nr(pfn)); } -/* - * If a hint addr is less than mmap_min_addr change hint to be as - * low as possible but still greater than mmap_min_addr - */ -static inline unsigned long round_hint_to_min(unsigned long hint) -{ - hint &= PAGE_MASK; - if (((void *)hint != NULL) && - (hint < mmap_min_addr)) - return PAGE_ALIGN(mmap_min_addr); - return hint; -} - /* * Some inline functions in vmstat.h depend on page_zone() */ diff --git a/include/linux/security.h b/include/linux/security.h index ac4bc3760b46..dc3472c1f781 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -28,6 +28,7 @@ #include #include #include +#include /* PAGE_ALIGN */ #include #include #include @@ -95,6 +96,7 @@ extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); extern int cap_netlink_recv(struct sk_buff *skb, int cap); extern unsigned long mmap_min_addr; +extern unsigned long dac_mmap_min_addr; /* * Values used in the task_security_ops calls */ @@ -147,6 +149,21 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) opts->num_mnt_opts = 0; } +/* + * If a hint addr is less than mmap_min_addr change hint to be as + * low as possible but still greater than mmap_min_addr + */ +static inline unsigned long round_hint_to_min(unsigned long hint) +{ + hint &= PAGE_MASK; + if (((void *)hint != NULL) && + (hint < mmap_min_addr)) + return PAGE_ALIGN(mmap_min_addr); + return hint; +} + +extern int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); /** * struct security_operations - main security structure * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 98e02328c67d..58be76017fd0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -1306,10 +1307,10 @@ static struct ctl_table vm_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "mmap_min_addr", - .data = &mmap_min_addr, - .maxlen = sizeof(unsigned long), + .data = &dac_mmap_min_addr, + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &mmap_min_addr_handler, }, #ifdef CONFIG_NUMA { diff --git a/mm/Kconfig b/mm/Kconfig index c948d4ca8bde..fe5f674d7a7d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -225,9 +225,9 @@ config DEFAULT_MMAP_MIN_ADDR For most ia64, ppc64 and x86 users with lots of address space a value of 65536 is reasonable and should cause no problems. On arm and other archs it should not be higher than 32768. - Programs which use vm86 functionality would either need additional - permissions from either the LSM or the capabilities module or have - this protection disabled. + Programs which use vm86 functionality or have some need to map + this low address space will need CAP_SYS_RAWIO or disable this + protection by setting the value to 0. This value can be changed after boot using the /proc/sys/vm/mmap_min_addr tunable. diff --git a/mm/mmap.c b/mm/mmap.c index 34579b23ebd5..8101de490c73 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; struct percpu_counter vm_committed_as; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to diff --git a/mm/nommu.c b/mm/nommu.c index 53cab10fece4..28754c40be98 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); diff --git a/security/Kconfig b/security/Kconfig index d23c839038f0..9c60c346a91d 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -113,6 +113,22 @@ config SECURITY_ROOTPLUG If you are unsure how to answer this question, answer N. +config LSM_MMAP_MIN_ADDR + int "Low address space for LSM to from user allocation" + depends on SECURITY && SECURITY_SELINUX + default 65535 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages + can help reduce the impact of kernel NULL pointer bugs. + + For most ia64, ppc64 and x86 users with lots of address space + a value of 65536 is reasonable and should cause no problems. + On arm and other archs it should not be higher than 32768. + Programs which use vm86 functionality or have some need to map + this low address space will need the permission specific to the + systems running LSM. + source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig diff --git a/security/Makefile b/security/Makefile index c67557cdaa85..b56e7f9ecbc2 100644 --- a/security/Makefile +++ b/security/Makefile @@ -8,7 +8,7 @@ subdir-$(CONFIG_SECURITY_SMACK) += smack subdir-$(CONFIG_SECURITY_TOMOYO) += tomoyo # always enable default capabilities -obj-y += commoncap.o +obj-y += commoncap.o min_addr.o # Object file lists obj-$(CONFIG_SECURITY) += security.o capability.o diff --git a/security/commoncap.c b/security/commoncap.c index 6bcf6e81e547..e3097c0a1311 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -1005,7 +1005,7 @@ int cap_file_mmap(struct file *file, unsigned long reqprot, { int ret = 0; - if (addr < mmap_min_addr) { + if (addr < dac_mmap_min_addr) { ret = cap_capable(current, current_cred(), CAP_SYS_RAWIO, SECURITY_CAP_AUDIT); /* set PF_SUPERPRIV if it turns out we allow the low mmap */ diff --git a/security/min_addr.c b/security/min_addr.c new file mode 100644 index 000000000000..14cc7b3b8d03 --- /dev/null +++ b/security/min_addr.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include + +/* amount of vm to protect from userspace access by both DAC and the LSM*/ +unsigned long mmap_min_addr; +/* amount of vm to protect from userspace using CAP_SYS_RAWIO (DAC) */ +unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; +/* amount of vm to protect from userspace using the LSM = CONFIG_LSM_MMAP_MIN_ADDR */ + +/* + * Update mmap_min_addr = max(dac_mmap_min_addr, CONFIG_LSM_MMAP_MIN_ADDR) + */ +static void update_mmap_min_addr(void) +{ +#ifdef CONFIG_LSM_MMAP_MIN_ADDR + if (dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR) + mmap_min_addr = dac_mmap_min_addr; + else + mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR; +#else + mmap_min_addr = dac_mmap_min_addr; +#endif +} + +/* + * sysctl handler which just sets dac_mmap_min_addr = the new value and then + * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly + */ +int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + + update_mmap_min_addr(); + + return ret; +} + +int __init init_mmap_min_addr(void) +{ + update_mmap_min_addr(); + + return 0; +} +pure_initcall(init_mmap_min_addr); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index e6d1432b0800..8d8b69c5664e 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3036,7 +3036,7 @@ static int selinux_file_mmap(struct file *file, unsigned long reqprot, * at bad behaviour/exploit that we always want to get the AVC, even * if DAC would have also denied the operation. */ - if (addr < mmap_min_addr) { + if (addr < CONFIG_LSM_MMAP_MIN_ADDR) { rc = avc_has_perm(sid, sid, SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, NULL); if (rc) -- cgit 1.4.1 From 0753ba01e126020bf0f8150934903b48935b697d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 18 Aug 2009 14:11:10 -0700 Subject: mm: revert "oom: move oom_adj value" The commit 2ff05b2b (oom: move oom_adj value) moveed the oom_adj value to the mm_struct. It was a very good first step for sanitize OOM. However Paul Menage reported the commit makes regression to his job scheduler. Current OOM logic can kill OOM_DISABLED process. Why? His program has the code of similar to the following. ... set_oom_adj(OOM_DISABLE); /* The job scheduler never killed by oom */ ... if (vfork() == 0) { set_oom_adj(0); /* Invoked child can be killed */ execve("foo-bar-cmd"); } .... vfork() parent and child are shared the same mm_struct. then above set_oom_adj(0) doesn't only change oom_adj for vfork() child, it's also change oom_adj for vfork() parent. Then, vfork() parent (job scheduler) lost OOM immune and it was killed. Actually, fork-setting-exec idiom is very frequently used in userland program. We must not break this assumption. Then, this patch revert commit 2ff05b2b and related commit. Reverted commit list --------------------- - commit 2ff05b2b4e (oom: move oom_adj value from task_struct to mm_struct) - commit 4d8b9135c3 (oom: avoid unnecessary mm locking and scanning for OOM_DISABLE) - commit 8123681022 (oom: only oom kill exiting tasks with attached memory) - commit 933b787b57 (mm: copy over oom_adj value at fork time) Signed-off-by: KOSAKI Motohiro Cc: Paul Menage Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Nick Piggin Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 15 +++------ fs/proc/base.c | 19 ++--------- include/linux/mm_types.h | 2 -- include/linux/sched.h | 1 + kernel/fork.c | 1 - mm/oom_kill.c | 64 +++++++++++++++++++++++--------------- 6 files changed, 48 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fad18f9456e4..ffead13f9443 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1167,13 +1167,11 @@ CHAPTER 3: PER-PROCESS PARAMETERS 3.1 /proc//oom_adj - Adjust the oom-killer score ------------------------------------------------------ -This file can be used to adjust the score used to select which processes should -be killed in an out-of-memory situation. The oom_adj value is a characteristic -of the task's mm, so all threads that share an mm with pid will have the same -oom_adj value. A high value will increase the likelihood of this process being -killed by the oom-killer. Valid values are in the range -16 to +15 as -explained below and a special value of -17, which disables oom-killing -altogether for threads sharing pid's mm. +This file can be used to adjust the score used to select which processes +should be killed in an out-of-memory situation. Giving it a high score will +increase the likelihood of this process being killed by the oom-killer. Valid +values are in the range -16 to +15, plus the special value -17, which disables +oom-killing altogether for this process. The process to be killed in an out-of-memory situation is selected among all others based on its badness score. This value equals the original memory size of the process @@ -1187,9 +1185,6 @@ the parent's score if they do not share the same memory. Thus forking servers are the prime candidates to be killed. Having only one 'hungry' child will make parent less preferable than the child. -/proc//oom_adj cannot be changed for kthreads since they are immune from -oom-killing already. - /proc//oom_score shows process' current badness score. The following heuristics are then applied: diff --git a/fs/proc/base.c b/fs/proc/base.c index 175db258942f..6f742f6658a9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1003,12 +1003,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, if (!task) return -ESRCH; - task_lock(task); - if (task->mm) - oom_adjust = task->mm->oom_adj; - else - oom_adjust = OOM_DISABLE; - task_unlock(task); + oom_adjust = task->oomkilladj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1037,19 +1032,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - task_lock(task); - if (!task->mm) { - task_unlock(task); - put_task_struct(task); - return -EINVAL; - } - if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { - task_unlock(task); + if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { put_task_struct(task); return -EACCES; } - task->mm->oom_adj = oom_adjust; - task_unlock(task); + task->oomkilladj = oom_adjust; put_task_struct(task); if (end - buffer == 0) return -EIO; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7acc8439d9b3..0042090a4d70 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -240,8 +240,6 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ - s8 oom_adj; /* OOM kill score adjustment (bit shift) */ - cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ab08e4bb6b8..0f1ea4a66957 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1198,6 +1198,7 @@ struct task_struct { * a short time */ unsigned char fpu_counter; + s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif diff --git a/kernel/fork.c b/kernel/fork.c index 021e1138556e..144326b7af50 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -426,7 +426,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; - mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 175a67a78a99..a7b2460e922b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; - int oom_adj; task_lock(p); mm = p->mm; @@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); return 0; } - oom_adj = mm->oom_adj; - if (oom_adj == OOM_DISABLE) { - task_unlock(p); - return 0; - } /* * The memory size of the process is the basis for the badness. @@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) points /= 8; /* - * Adjust the score by oom_adj. + * Adjust the score by oomkilladj. */ - if (oom_adj) { - if (oom_adj > 0) { + if (p->oomkilladj) { + if (p->oomkilladj > 0) { if (!points) points = 1; - points <<= oom_adj; + points <<= p->oomkilladj; } else - points >>= -(oom_adj); + points >>= -(p->oomkilladj); } #ifdef DEBUG @@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } + if (p->oomkilladj == OOM_DISABLE) + continue; + points = badness(p, uptime.tv_sec); - if (points > *ppoints) { + if (points > *ppoints || !chosen) { chosen = p; *ppoints = points; } @@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); + get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, + p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose) return; } - if (!p->mm) + if (!p->mm) { + WARN_ON(1); + printk(KERN_WARNING "tried to kill an mm-less task!\n"); return; + } if (verbose) printk(KERN_ERR "Killed process %d (%s)\n", @@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p) struct mm_struct *mm; struct task_struct *g, *q; - task_lock(p); mm = p->mm; - if (!mm || mm->oom_adj == OOM_DISABLE) { - task_unlock(p); + + /* WARNING: mm may not be dereferenced since we did not obtain its + * value from get_task_mm(p). This is OK since all we need to do is + * compare mm to q->mm below. + * + * Furthermore, even if mm contains a non-NULL value, p->mm may + * change to NULL at any time since we do not hold task_lock(p). + * However, this is of no concern to us. + */ + + if (mm == NULL) return 1; - } - task_unlock(p); + + /* + * Don't kill the process if any threads are set to OOM_DISABLE + */ + do_each_thread(g, q) { + if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + return 1; + } while_each_thread(g, q); + __oom_kill_task(p, 1); /* @@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct task_struct *c; if (printk_ratelimit()) { - task_lock(current); printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oom_adj=%d\n", - current->comm, gfp_mask, order, - current->mm ? current->mm->oom_adj : OOM_DISABLE); + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); + task_lock(current); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); @@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly - * if its mm is still attached. */ - if (p->mm && (p->flags & PF_EXITING)) { + if (p->flags & PF_EXITING) { __oom_kill_task(p, 0); return 0; } -- cgit 1.4.1 From 28d7a6ae92c099d81cbea08c20be0d2cf7ccd7ca Mon Sep 17 00:00:00 2001 From: Graff Yang Date: Tue, 18 Aug 2009 14:11:17 -0700 Subject: nommu: check fd read permission in validate_mmap_request() According to the POSIX (1003.1-2008), the file descriptor shall have been opened with read permission, regardless of the protection options specified to mmap(). The ltp test cases mmap06/07 need this. Signed-off-by: Graff Yang Acked-by: Paul Mundt Signed-off-by: David Howells Acked-by: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 28754c40be98..4bde489ec431 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -919,6 +919,10 @@ static int validate_mmap_request(struct file *file, if (!file->f_op->read) capabilities &= ~BDI_CAP_MAP_COPY; + /* The file shall have been opened with read permission. */ + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + if (flags & MAP_SHARED) { /* do checks for writing, appending and locking */ if ((prot & PROT_WRITE) && -- cgit 1.4.1 From 7f9cfb31030737a7fc9a1cbca3fd01bec184c849 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 18 Aug 2009 14:11:19 -0700 Subject: mm: build_zonelists(): move clear node_load[] to __build_all_zonelists() If node_load[] is cleared everytime build_zonelists() is called,node_load[] will have no help to find the next node that should appear in the given node's fallback list. Because of the bug, zonelist's node_order is not calculated as expected. This bug affects on big machine, which has asynmetric node distance. [synmetric NUMA's node distance] 0 1 2 0 10 12 12 1 12 10 12 2 12 12 10 [asynmetric NUMA's node distance] 0 1 2 0 10 12 20 1 12 10 14 2 20 14 10 This (my bug) is very old but no one has reported this for a long time. Maybe because the number of asynmetric NUMA is very small and they use cpuset for customizing node memory allocation fallback. [akpm@linux-foundation.org: fix CONFIG_NUMA=n build] Signed-off-by: Bo Liu Reviewed-by: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d052abbe3063..5cc986eb9f6f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2544,7 +2544,6 @@ static void build_zonelists(pg_data_t *pgdat) prev_node = local_node; nodes_clear(used_mask); - memset(node_load, 0, sizeof(node_load)); memset(node_order, 0, sizeof(node_order)); j = 0; @@ -2653,6 +2652,9 @@ static int __build_all_zonelists(void *dummy) { int nid; +#ifdef CONFIG_NUMA + memset(node_load, 0, sizeof(node_load)); +#endif for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); -- cgit 1.4.1