From 0abdd7a81b7e3fd781d7fabcca49501852bba17e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 21 Jan 2014 15:48:12 -0800 Subject: dma-debug: introduce debug_dma_assert_idle() Record actively mapped pages and provide an api for asserting a given page is dma inactive before execution proceeds. Placing debug_dma_assert_idle() in cow_user_page() flagged the violation of the dma-api in the NET_DMA implementation (see commit 77873803363c "net_dma: mark broken"). The implementation includes the capability to count, in a limited way, repeat mappings of the same page that occur without an intervening unmap. This 'overlap' counter is limited to the few bits of tag space in a radix tree. This mechanism is added to mitigate false negative cases where, for example, a page is dma mapped twice and debug_dma_assert_idle() is called after the page is un-mapped once. Signed-off-by: Dan Williams Cc: Joerg Roedel Cc: Vinod Koul Cc: Russell King Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dma-debug.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index fc0e34ce038f..fe8cb610deac 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -85,6 +85,8 @@ extern void debug_dma_sync_sg_for_device(struct device *dev, extern void debug_dma_dump_mappings(struct device *dev); +extern void debug_dma_assert_idle(struct page *page); + #else /* CONFIG_DMA_API_DEBUG */ static inline void dma_debug_add_bus(struct bus_type *bus) @@ -183,6 +185,10 @@ static inline void debug_dma_dump_mappings(struct device *dev) { } +static inline void debug_dma_assert_idle(struct page *page) +{ +} + #endif /* CONFIG_DMA_API_DEBUG */ #endif /* __DMA_DEBUG_H */ -- cgit 1.4.1 From 7053aee26a3548ebaba046ae2e52396ccf56ac6c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Jan 2014 15:48:14 -0800 Subject: fsnotify: do not share events between notification groups Currently fsnotify framework creates one event structure for each notification event and links this event into all interested notification groups. This is done so that we save memory when several notification groups are interested in the event. However the need for event structure shared between inotify & fanotify bloats the event structure so the result is often higher memory consumption. Another problem is that fsnotify framework keeps path references with outstanding events so that fanotify can return open file descriptors with its events. This has the undesirable effect that filesystem cannot be unmounted while there are outstanding events - a regression for inotify compared to a situation before it was converted to fsnotify framework. For fanotify this problem is hard to avoid and users of fanotify should kind of expect this behavior when they ask for file descriptors from notified files. This patch changes fsnotify and its users to create separate event structure for each group. This allows for much simpler code (~400 lines removed by this patch) and also smaller event structures. For example on 64-bit system original struct fsnotify_event consumes 120 bytes, plus additional space for file name, additional 24 bytes for second and each subsequent group linking the event, and additional 32 bytes for each inotify group for private data. 
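The conversion works by keeping the generic event small and embedding it at the head of each backend's private event structure, which the backend then recovers with container_of(). A simplified sketch of the layout this patch introduces (field lists abbreviated from the fanotify.h and fsnotify_backend.h hunks below):

    /* generic part, linked on the group's notification_list */
    struct fsnotify_event {
            struct list_head list;
            struct inode *inode;    /* may only be dereferenced in handle_event() */
            u32 mask;               /* FS_* event bits */
    };

    /* fanotify's per-group event embeds the generic part first */
    struct fanotify_event_info {
            struct fsnotify_event fse;
            struct path path;
            struct pid *tgid;
    };

    static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
    {
            return container_of(fse, struct fanotify_event_info, fse);
    }
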
After the conversion inotify event consumes 48 bytes plus space for file name which is considerably less memory unless file names are long and there are several groups interested in the events (both of which are uncommon). Fanotify event fits in 56 bytes after the conversion (fanotify doesn't care about file names so its events don't have to have it allocated). A win unless there are four or more fanotify groups interested in the event. The conversion also solves the problem with unmount when only inotify is used as we don't have to grab path references for inotify events. [hughd@google.com: fanotify: fix corruption preventing startup] Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Eric Paris Cc: Al Viro Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 11 +- fs/notify/fanotify/fanotify.c | 211 +++++++++++----------- fs/notify/fanotify/fanotify.h | 23 +++ fs/notify/fanotify/fanotify_user.c | 41 +++-- fs/notify/fsnotify.c | 37 ++-- fs/notify/group.c | 1 + fs/notify/inotify/inotify.h | 21 ++- fs/notify/inotify/inotify_fsnotify.c | 125 +++++-------- fs/notify/inotify/inotify_user.c | 86 +++------ fs/notify/notification.c | 334 +++-------------------------------- include/linux/fsnotify_backend.h | 114 +++--------- kernel/audit_tree.c | 8 +- kernel/audit_watch.c | 14 +- 13 files changed, 318 insertions(+), 708 deletions(-) create mode 100644 fs/notify/fanotify/fanotify.h (limited to 'include') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 1fedd5f7ccc4..bfca53dbbf34 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -82,21 +82,20 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) * events. */ static int dnotify_handle_event(struct fsnotify_group *group, + struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) + u32 mask, void *data, int data_type, + const unsigned char *file_name) { struct dnotify_mark *dn_mark; - struct inode *to_tell; struct dnotify_struct *dn; struct dnotify_struct **prev; struct fown_struct *fown; - __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; + __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; BUG_ON(vfsmount_mark); - to_tell = event->to_tell; - dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); spin_lock(&inode_mark->lock); @@ -155,7 +154,7 @@ static struct fsnotify_ops dnotify_fsnotify_ops = { .should_send_event = dnotify_should_send_event, .free_group_priv = NULL, .freeing_mark = NULL, - .free_event_priv = NULL, + .free_event = NULL, }; /* diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 0c2f9122b262..c26268d7bd9d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -9,31 +9,27 @@ #include #include -static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) +#include "fanotify.h" + +static bool should_merge(struct fsnotify_event *old_fsn, + struct fsnotify_event *new_fsn) { - pr_debug("%s: old=%p new=%p\n", __func__, old, new); + struct fanotify_event_info *old, *new; - if (old->to_tell == new->to_tell && - old->data_type == new->data_type && - old->tgid == new->tgid) { - switch (old->data_type) { - case (FSNOTIFY_EVENT_PATH): #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - /* dont merge two permission events */ - if ((old->mask & FAN_ALL_PERM_EVENTS) && - (new->mask & FAN_ALL_PERM_EVENTS)) - return false; + /* dont merge two permission events */ + 
if ((old_fsn->mask & FAN_ALL_PERM_EVENTS) && + (new_fsn->mask & FAN_ALL_PERM_EVENTS)) + return false; #endif - if ((old->path.mnt == new->path.mnt) && - (old->path.dentry == new->path.dentry)) - return true; - break; - case (FSNOTIFY_EVENT_NONE): - return true; - default: - BUG(); - }; - } + pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); + old = FANOTIFY_E(old_fsn); + new = FANOTIFY_E(new_fsn); + + if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid && + old->path.mnt == new->path.mnt && + old->path.dentry == new->path.dentry) + return true; return false; } @@ -41,59 +37,28 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) static struct fsnotify_event *fanotify_merge(struct list_head *list, struct fsnotify_event *event) { - struct fsnotify_event_holder *test_holder; - struct fsnotify_event *test_event = NULL; - struct fsnotify_event *new_event; + struct fsnotify_event *test_event; + bool do_merge = false; pr_debug("%s: list=%p event=%p\n", __func__, list, event); - - list_for_each_entry_reverse(test_holder, list, event_list) { - if (should_merge(test_holder->event, event)) { - test_event = test_holder->event; + list_for_each_entry_reverse(test_event, list, list) { + if (should_merge(test_event, event)) { + do_merge = true; break; } } - if (!test_event) + if (!do_merge) return NULL; - fsnotify_get_event(test_event); - - /* if they are exactly the same we are done */ - if (test_event->mask == event->mask) - return test_event; - - /* - * if the refcnt == 2 this is the only queue - * for this event and so we can update the mask - * in place. - */ - if (atomic_read(&test_event->refcnt) == 2) { - test_event->mask |= event->mask; - return test_event; - } - - new_event = fsnotify_clone_event(test_event); - - /* done with test_event */ - fsnotify_put_event(test_event); - - /* couldn't allocate memory, merge was not possible */ - if (unlikely(!new_event)) - return ERR_PTR(-ENOMEM); - - /* build new event and replace it on the list */ - new_event->mask = (test_event->mask | event->mask); - fsnotify_replace_event(test_holder, new_event); - - /* we hold a reference on new_event from clone_event */ - return new_event; + test_event->mask |= event->mask; + return test_event; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS static int fanotify_get_response_from_access(struct fsnotify_group *group, - struct fsnotify_event *event) + struct fanotify_event_info *event) { int ret; @@ -106,7 +71,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, return 0; /* userspace responded, convert to something usable */ - spin_lock(&event->lock); switch (event->response) { case FAN_ALLOW: ret = 0; @@ -116,7 +80,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, ret = -EPERM; } event->response = 0; - spin_unlock(&event->lock); pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, group, event, ret); @@ -125,48 +88,8 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, } #endif -static int fanotify_handle_event(struct fsnotify_group *group, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *fanotify_mark, - struct fsnotify_event *event) -{ - int ret = 0; - struct fsnotify_event *notify_event = NULL; - - BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); - BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); - BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); - BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); - BUILD_BUG_ON(FAN_OPEN != FS_OPEN); - BUILD_BUG_ON(FAN_EVENT_ON_CHILD != 
FS_EVENT_ON_CHILD); - BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); - BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); - BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); - BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); - - pr_debug("%s: group=%p event=%p\n", __func__, group, event); - - notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge); - if (IS_ERR(notify_event)) - return PTR_ERR(notify_event); - -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (event->mask & FAN_ALL_PERM_EVENTS) { - /* if we merged we need to wait on the new event */ - if (notify_event) - event = notify_event; - ret = fanotify_get_response_from_access(group, event); - } -#endif - - if (notify_event) - fsnotify_put_event(notify_event); - - return ret; -} - static bool fanotify_should_send_event(struct fsnotify_group *group, - struct inode *to_tell, + struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, __u32 event_mask, void *data, int data_type) @@ -174,8 +97,8 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, __u32 marks_mask, marks_ignored_mask; struct path *path = data; - pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " - "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, + pr_debug("%s: group=%p inode=%p inode_mark=%p vfsmnt_mark=%p " + "mask=%x data=%p data_type=%d\n", __func__, group, inode, inode_mark, vfsmnt_mark, event_mask, data, data_type); /* if we don't have enough info to send an event to userspace say no */ @@ -217,6 +140,70 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, return false; } +static int fanotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *fanotify_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) +{ + int ret = 0; + struct fanotify_event_info *event; + struct fsnotify_event *fsn_event; + struct fsnotify_event *notify_fsn_event; + + BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); + BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); + BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); + BUILD_BUG_ON(FAN_OPEN != FS_OPEN); + BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); + BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); + BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); + BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); + BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + mask); + + event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + if (unlikely(!event)) + return -ENOMEM; + + fsn_event = &event->fse; + fsnotify_init_event(fsn_event, inode, mask); + event->tgid = get_pid(task_tgid(current)); + if (data_type == FSNOTIFY_EVENT_PATH) { + struct path *path = data; + event->path = *path; + path_get(&event->path); + } else { + event->path.mnt = NULL; + event->path.dentry = NULL; + } +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + event->response = 0; +#endif + + notify_fsn_event = fsnotify_add_notify_event(group, fsn_event, + fanotify_merge); + if (notify_fsn_event) { + /* Our event wasn't used in the end. Free it. */ + fsnotify_destroy_event(group, fsn_event); + if (IS_ERR(notify_fsn_event)) + return PTR_ERR(notify_fsn_event); + /* We need to ask about a different events after a merge... 
*/ + event = FANOTIFY_E(notify_fsn_event); + fsn_event = notify_fsn_event; + } + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (fsn_event->mask & FAN_ALL_PERM_EVENTS) + ret = fanotify_get_response_from_access(group, event); +#endif + return ret; +} + static void fanotify_free_group_priv(struct fsnotify_group *group) { struct user_struct *user; @@ -226,10 +213,20 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) free_uid(user); } +static void fanotify_free_event(struct fsnotify_event *fsn_event) +{ + struct fanotify_event_info *event; + + event = FANOTIFY_E(fsn_event); + path_put(&event->path); + put_pid(event->tgid); + kmem_cache_free(fanotify_event_cachep, event); +} + const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, .should_send_event = fanotify_should_send_event, .free_group_priv = fanotify_free_group_priv, - .free_event_priv = NULL, + .free_event = fanotify_free_event, .freeing_mark = NULL, }; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h new file mode 100644 index 000000000000..0e90174a116a --- /dev/null +++ b/fs/notify/fanotify/fanotify.h @@ -0,0 +1,23 @@ +#include +#include +#include + +extern struct kmem_cache *fanotify_event_cachep; + +struct fanotify_event_info { + struct fsnotify_event fse; + /* + * We hold ref to this path so it may be dereferenced at any point + * during this object's lifetime + */ + struct path path; + struct pid *tgid; +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + u32 response; /* userspace answer to question */ +#endif +}; + +static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) +{ + return container_of(fse, struct fanotify_event_info, fse); +} diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index e44cb6427df3..57d7c083cb4b 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -19,6 +19,7 @@ #include "../../mount.h" #include "../fdinfo.h" +#include "fanotify.h" #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_DEFAULT_MAX_MARKS 8192 @@ -28,11 +29,12 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; static struct kmem_cache *fanotify_mark_cache __read_mostly; static struct kmem_cache *fanotify_response_event_cache __read_mostly; +struct kmem_cache *fanotify_event_cachep __read_mostly; struct fanotify_response_event { struct list_head list; __s32 fd; - struct fsnotify_event *event; + struct fanotify_event_info *event; }; /* @@ -61,8 +63,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, } static int create_fd(struct fsnotify_group *group, - struct fsnotify_event *event, - struct file **file) + struct fanotify_event_info *event, + struct file **file) { int client_fd; struct file *new_file; @@ -73,12 +75,6 @@ static int create_fd(struct fsnotify_group *group, if (client_fd < 0) return client_fd; - if (event->data_type != FSNOTIFY_EVENT_PATH) { - WARN_ON(1); - put_unused_fd(client_fd); - return -EINVAL; - } - /* * we need a new file handle for the userspace program so it can read even if it was * originally opened O_WRONLY. 
@@ -109,23 +105,25 @@ static int create_fd(struct fsnotify_group *group, } static int fill_event_metadata(struct fsnotify_group *group, - struct fanotify_event_metadata *metadata, - struct fsnotify_event *event, - struct file **file) + struct fanotify_event_metadata *metadata, + struct fsnotify_event *fsn_event, + struct file **file) { int ret = 0; + struct fanotify_event_info *event; pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, - group, metadata, event); + group, metadata, fsn_event); *file = NULL; + event = container_of(fsn_event, struct fanotify_event_info, fse); metadata->event_len = FAN_EVENT_METADATA_LEN; metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; metadata->reserved = 0; - metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; + metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS; metadata->pid = pid_vnr(event->tgid); - if (unlikely(event->mask & FAN_Q_OVERFLOW)) + if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW)) metadata->fd = FAN_NOFD; else { metadata->fd = create_fd(group, event, file); @@ -209,7 +207,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, if (!re) return -ENOMEM; - re->event = event; + re->event = FANOTIFY_E(event); re->fd = fd; mutex_lock(&group->fanotify_data.access_mutex); @@ -217,7 +215,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, if (atomic_read(&group->fanotify_data.bypass_perm)) { mutex_unlock(&group->fanotify_data.access_mutex); kmem_cache_free(fanotify_response_event_cache, re); - event->response = FAN_ALLOW; + FANOTIFY_E(event)->response = FAN_ALLOW; return 0; } @@ -273,7 +271,7 @@ out_close_fd: out: #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (event->mask & FAN_ALL_PERM_EVENTS) { - event->response = FAN_DENY; + FANOTIFY_E(event)->response = FAN_DENY; wake_up(&group->fanotify_data.access_waitq); } #endif @@ -321,7 +319,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); - fsnotify_put_event(kevent); + fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; @@ -409,7 +407,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; - struct fsnotify_event_holder *holder; + struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; @@ -421,7 +419,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar switch (cmd) { case FIONREAD: mutex_lock(&group->notification_mutex); - list_for_each_entry(holder, &group->notification_list, event_list) + list_for_each_entry(fsn_event, &group->notification_list, list) send_len += FAN_EVENT_METADATA_LEN; mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -906,6 +904,7 @@ static int __init fanotify_user_setup(void) fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, SLAB_PANIC); + fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); return 0; } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 4bb21d67d9b1..7c754c91c3f6 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -128,8 +128,7 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_mark *vfsmount_mark, __u32 mask, void *data, int data_is, u32 cookie, - const unsigned char *file_name, - struct 
fsnotify_event **event) + const unsigned char *file_name) { struct fsnotify_group *group = NULL; __u32 inode_test_mask = 0; @@ -170,10 +169,10 @@ static int send_to_group(struct inode *to_tell, pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" - " data=%p data_is=%d cookie=%d event=%p\n", + " data=%p data_is=%d cookie=%d\n", __func__, group, to_tell, mask, inode_mark, inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, - data_is, cookie, *event); + data_is, cookie); if (!inode_test_mask && !vfsmount_test_mask) return 0; @@ -183,14 +182,9 @@ static int send_to_group(struct inode *to_tell, data_is) == false) return 0; - if (!*event) { - *event = fsnotify_create_event(to_tell, mask, data, - data_is, file_name, - cookie, GFP_KERNEL); - if (!*event) - return -ENOMEM; - } - return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event); + return group->ops->handle_event(group, to_tell, inode_mark, + vfsmount_mark, mask, data, data_is, + file_name); } /* @@ -205,7 +199,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; - struct fsnotify_event *event = NULL; struct mount *mnt; int idx, ret = 0; /* global tests shouldn't care about events on child only the specific event */ @@ -258,18 +251,18 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, if (inode_group > vfsmount_group) { /* handle inode */ - ret = send_to_group(to_tell, inode_mark, NULL, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, inode_mark, NULL, mask, + data, data_is, cookie, file_name); /* we didn't use the vfsmount_mark */ vfsmount_group = NULL; } else if (vfsmount_group > inode_group) { - ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, + data, data_is, cookie, file_name); inode_group = NULL; } else { ret = send_to_group(to_tell, inode_mark, vfsmount_mark, - mask, data, data_is, cookie, file_name, - &event); + mask, data, data_is, cookie, + file_name); } if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) @@ -285,12 +278,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, ret = 0; out: srcu_read_unlock(&fsnotify_mark_srcu, idx); - /* - * fsnotify_create_event() took a reference so the event can't be cleaned - * up while we are still trying to add it to lists, drop that one. 
- */ - if (event) - fsnotify_put_event(event); return ret; } diff --git a/fs/notify/group.c b/fs/notify/group.c index bd2625bd88b4..ee674fe2cec7 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -99,6 +99,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) INIT_LIST_HEAD(&group->marks_list); group->ops = ops; + fsnotify_init_event(&group->overflow_event, NULL, FS_Q_OVERFLOW); return group; } diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index b6642e4de4bf..485eef3f4407 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -2,11 +2,12 @@ #include #include /* struct kmem_cache */ -extern struct kmem_cache *event_priv_cachep; - -struct inotify_event_private_data { - struct fsnotify_event_private_data fsnotify_event_priv_data; +struct inotify_event_info { + struct fsnotify_event fse; int wd; + u32 sync_cookie; + int name_len; + char name[]; }; struct inotify_inode_mark { @@ -14,8 +15,18 @@ struct inotify_inode_mark { int wd; }; +static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) +{ + return container_of(fse, struct inotify_event_info, fse); +} + extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); -extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); +extern int inotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 4216308b81b4..6fabbd163d16 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -34,100 +34,80 @@ #include "inotify.h" /* - * Check if 2 events contain the same information. We do not compare private data - * but at this moment that isn't a problem for any know fsnotify listeners. + * Check if 2 events contain the same information. 
*/ -static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) +static bool event_compare(struct fsnotify_event *old_fsn, + struct fsnotify_event *new_fsn) { - if ((old->mask == new->mask) && - (old->to_tell == new->to_tell) && - (old->data_type == new->data_type) && - (old->name_len == new->name_len)) { - switch (old->data_type) { - case (FSNOTIFY_EVENT_INODE): - /* remember, after old was put on the wait_q we aren't - * allowed to look at the inode any more, only thing - * left to check was if the file_name is the same */ - if (!old->name_len || - !strcmp(old->file_name, new->file_name)) - return true; - break; - case (FSNOTIFY_EVENT_PATH): - if ((old->path.mnt == new->path.mnt) && - (old->path.dentry == new->path.dentry)) - return true; - break; - case (FSNOTIFY_EVENT_NONE): - if (old->mask & FS_Q_OVERFLOW) - return true; - else if (old->mask & FS_IN_IGNORED) - return false; - return true; - }; - } + struct inotify_event_info *old, *new; + + if (old_fsn->mask & FS_IN_IGNORED) + return false; + old = INOTIFY_E(old_fsn); + new = INOTIFY_E(new_fsn); + if ((old_fsn->mask == new_fsn->mask) && + (old_fsn->inode == new_fsn->inode) && + (old->name_len == new->name_len) && + (!old->name_len || !strcmp(old->name, new->name))) + return true; return false; } static struct fsnotify_event *inotify_merge(struct list_head *list, struct fsnotify_event *event) { - struct fsnotify_event_holder *last_holder; struct fsnotify_event *last_event; - /* and the list better be locked by something too */ - spin_lock(&event->lock); - - last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); - last_event = last_holder->event; - if (event_compare(last_event, event)) - fsnotify_get_event(last_event); - else - last_event = NULL; - - spin_unlock(&event->lock); - + last_event = list_entry(list->prev, struct fsnotify_event, list); + if (!event_compare(last_event, event)) + return NULL; return last_event; } -static int inotify_handle_event(struct fsnotify_group *group, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) +int inotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) { struct inotify_inode_mark *i_mark; - struct inode *to_tell; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; + struct inotify_event_info *event; struct fsnotify_event *added_event; - int wd, ret = 0; + struct fsnotify_event *fsn_event; + int ret = 0; + int len = 0; + int alloc_len = sizeof(struct inotify_event_info); BUG_ON(vfsmount_mark); - pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group, - event, event->to_tell, event->mask); + if (file_name) { + len = strlen(file_name); + alloc_len += len + 1; + } - to_tell = event->to_tell; + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + mask); i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); - wd = i_mark->wd; - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); - if (unlikely(!event_priv)) + event = kmalloc(alloc_len, GFP_KERNEL); + if (unlikely(!event)) return -ENOMEM; - fsn_event_priv = &event_priv->fsnotify_event_priv_data; + fsn_event = &event->fse; + fsnotify_init_event(fsn_event, inode, mask); + event->wd = i_mark->wd; + event->name_len = len; + if (len) + strcpy(event->name, file_name); - 
fsnotify_get_group(group); - fsn_event_priv->group = group; - event_priv->wd = wd; - - added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge); + added_event = fsnotify_add_notify_event(group, fsn_event, inotify_merge); if (added_event) { - inotify_free_event_priv(fsn_event_priv); - if (!IS_ERR(added_event)) - fsnotify_put_event(added_event); - else + /* Our event wasn't used in the end. Free it. */ + fsnotify_destroy_event(group, fsn_event); + if (IS_ERR(added_event)) ret = PTR_ERR(added_event); } @@ -202,22 +182,15 @@ static void inotify_free_group_priv(struct fsnotify_group *group) free_uid(group->inotify_data.user); } -void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) +static void inotify_free_event(struct fsnotify_event *fsn_event) { - struct inotify_event_private_data *event_priv; - - - event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - - fsnotify_put_group(fsn_event_priv->group); - kmem_cache_free(event_priv_cachep, event_priv); + kfree(INOTIFY_E(fsn_event)); } const struct fsnotify_ops inotify_fsnotify_ops = { .handle_event = inotify_handle_event, .should_send_event = inotify_should_send_event, .free_group_priv = inotify_free_group_priv, - .free_event_priv = inotify_free_event_priv, + .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, }; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1bb6dc8eaf1c..497395c8274b 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -50,7 +50,6 @@ static int inotify_max_queued_events __read_mostly; static int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; -struct kmem_cache *event_priv_cachep __read_mostly; #ifdef CONFIG_SYSCTL @@ -124,8 +123,11 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) return ret; } -static int round_event_name_len(struct fsnotify_event *event) +static int round_event_name_len(struct fsnotify_event *fsn_event) { + struct inotify_event_info *event; + + event = INOTIFY_E(fsn_event); if (!event->name_len) return 0; return roundup(event->name_len + 1, sizeof(struct inotify_event)); @@ -169,40 +171,27 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, * buffer we had in "get_one_event()" above. */ static ssize_t copy_event_to_user(struct fsnotify_group *group, - struct fsnotify_event *event, + struct fsnotify_event *fsn_event, char __user *buf) { struct inotify_event inotify_event; - struct fsnotify_event_private_data *fsn_priv; - struct inotify_event_private_data *priv; + struct inotify_event_info *event; size_t event_size = sizeof(struct inotify_event); size_t name_len; size_t pad_name_len; - pr_debug("%s: group=%p event=%p\n", __func__, group, event); - - /* we get the inotify watch descriptor from the event private data */ - spin_lock(&event->lock); - fsn_priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - - if (!fsn_priv) - inotify_event.wd = -1; - else { - priv = container_of(fsn_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - inotify_event.wd = priv->wd; - inotify_free_event_priv(fsn_priv); - } + pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); + event = INOTIFY_E(fsn_event); name_len = event->name_len; /* * round up name length so it is a multiple of event_size * plus an extra byte for the terminating '\0'. 
*/ - pad_name_len = round_event_name_len(event); + pad_name_len = round_event_name_len(fsn_event); inotify_event.len = pad_name_len; - inotify_event.mask = inotify_mask_to_arg(event->mask); + inotify_event.mask = inotify_mask_to_arg(fsn_event->mask); + inotify_event.wd = event->wd; inotify_event.cookie = event->sync_cookie; /* send the main event */ @@ -218,7 +207,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, */ if (pad_name_len) { /* copy the path name */ - if (copy_to_user(buf, event->file_name, name_len)) + if (copy_to_user(buf, event->name, name_len)) return -EFAULT; buf += name_len; @@ -257,7 +246,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); - fsnotify_put_event(kevent); + fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; @@ -300,8 +289,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; - struct fsnotify_event_holder *holder; - struct fsnotify_event *event; + struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; @@ -314,10 +302,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case FIONREAD: mutex_lock(&group->notification_mutex); - list_for_each_entry(holder, &group->notification_list, event_list) { - event = holder->event; + list_for_each_entry(fsn_event, &group->notification_list, + list) { send_len += sizeof(struct inotify_event); - send_len += round_event_name_len(event); + send_len += round_event_name_len(fsn_event); } mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -504,43 +492,12 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { struct inotify_inode_mark *i_mark; - struct fsnotify_event *ignored_event, *notify_event; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; - int ret; - i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); - - ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, - FSNOTIFY_EVENT_NONE, NULL, 0, - GFP_NOFS); - if (!ignored_event) - goto skip_send_ignore; - - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); - if (unlikely(!event_priv)) - goto skip_send_ignore; - - fsn_event_priv = &event_priv->fsnotify_event_priv_data; - - fsnotify_get_group(group); - fsn_event_priv->group = group; - event_priv->wd = i_mark->wd; - - notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL); - if (notify_event) { - if (IS_ERR(notify_event)) - ret = PTR_ERR(notify_event); - else - fsnotify_put_event(notify_event); - inotify_free_event_priv(fsn_event_priv); - } - -skip_send_ignore: - /* matches the reference taken when the event was created */ - if (ignored_event) - fsnotify_put_event(ignored_event); + /* Queue ignore event for the watch */ + inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, + NULL, FSNOTIFY_EVENT_NONE, NULL); + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); @@ -837,7 +794,6 @@ static int __init inotify_user_setup(void) BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); - event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); inotify_max_queued_events = 16384; 
inotify_max_user_instances = 128; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 7b51b05f160c..952237b8e2d2 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -48,15 +48,6 @@ #include #include "fsnotify.h" -static struct kmem_cache *fsnotify_event_cachep; -static struct kmem_cache *fsnotify_event_holder_cachep; -/* - * This is a magic event we send when the q is too full. Since it doesn't - * hold real event information we just keep one system wide and use it any time - * it is needed. It's refcnt is set 1 at kernel init time and will never - * get set to 0 so it will never get 'freed' - */ -static struct fsnotify_event *q_overflow_event; static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); /** @@ -76,60 +67,14 @@ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) return list_empty(&group->notification_list) ? true : false; } -void fsnotify_get_event(struct fsnotify_event *event) +void fsnotify_destroy_event(struct fsnotify_group *group, + struct fsnotify_event *event) { - atomic_inc(&event->refcnt); -} - -void fsnotify_put_event(struct fsnotify_event *event) -{ - if (!event) + /* Overflow events are per-group and we don't want to free them */ + if (!event || event->mask == FS_Q_OVERFLOW) return; - if (atomic_dec_and_test(&event->refcnt)) { - pr_debug("%s: event=%p\n", __func__, event); - - if (event->data_type == FSNOTIFY_EVENT_PATH) - path_put(&event->path); - - BUG_ON(!list_empty(&event->private_data_list)); - - kfree(event->file_name); - put_pid(event->tgid); - kmem_cache_free(fsnotify_event_cachep, event); - } -} - -struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) -{ - return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); -} - -void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) -{ - if (holder) - kmem_cache_free(fsnotify_event_holder_cachep, holder); -} - -/* - * Find the private data that the group previously attached to this event when - * the group added the event to the notification queue (fsnotify_add_notify_event) - */ -struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) -{ - struct fsnotify_event_private_data *lpriv; - struct fsnotify_event_private_data *priv = NULL; - - assert_spin_locked(&event->lock); - - list_for_each_entry(lpriv, &event->private_data_list, event_list) { - if (lpriv->group == group) { - priv = lpriv; - list_del(&priv->event_list); - break; - } - } - return priv; + group->ops->free_event(event); } /* @@ -137,91 +82,35 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot * event off the queue to deal with. If the event is successfully added to the * group's notification queue, a reference is taken on event. */ -struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, - struct fsnotify_event_private_data *priv, +struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, + struct fsnotify_event *event, struct fsnotify_event *(*merge)(struct list_head *, struct fsnotify_event *)) { struct fsnotify_event *return_event = NULL; - struct fsnotify_event_holder *holder = NULL; struct list_head *list = &group->notification_list; - pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv); - - /* - * There is one fsnotify_event_holder embedded inside each fsnotify_event. - * Check if we expect to be able to use that holder. If not alloc a new - * holder. 
- * For the overflow event it's possible that something will use the in - * event holder before we get the lock so we may need to jump back and - * alloc a new holder, this can't happen for most events... - */ - if (!list_empty(&event->holder.event_list)) { -alloc_holder: - holder = fsnotify_alloc_event_holder(); - if (!holder) - return ERR_PTR(-ENOMEM); - } + pr_debug("%s: group=%p event=%p\n", __func__, group, event); mutex_lock(&group->notification_mutex); if (group->q_len >= group->max_events) { - event = q_overflow_event; - - /* - * we need to return the overflow event - * which means we need a ref - */ - fsnotify_get_event(event); + /* Queue overflow event only if it isn't already queued */ + if (list_empty(&group->overflow_event.list)) + event = &group->overflow_event; return_event = event; - - /* sorry, no private data on the overflow event */ - priv = NULL; } if (!list_empty(list) && merge) { - struct fsnotify_event *tmp; - - tmp = merge(list, event); - if (tmp) { - mutex_unlock(&group->notification_mutex); - - if (return_event) - fsnotify_put_event(return_event); - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - return tmp; - } - } - - spin_lock(&event->lock); - - if (list_empty(&event->holder.event_list)) { - if (unlikely(holder)) - fsnotify_destroy_event_holder(holder); - holder = &event->holder; - } else if (unlikely(!holder)) { - /* between the time we checked above and got the lock the in - * event holder was used, go back and get a new one */ - spin_unlock(&event->lock); - mutex_unlock(&group->notification_mutex); - + return_event = merge(list, event); if (return_event) { - fsnotify_put_event(return_event); - return_event = NULL; + mutex_unlock(&group->notification_mutex); + return return_event; } - - goto alloc_holder; } group->q_len++; - holder->event = event; - - fsnotify_get_event(event); - list_add_tail(&holder->event_list, list); - if (priv) - list_add_tail(&priv->event_list, &event->private_data_list); - spin_unlock(&event->lock); + list_add_tail(&event->list, list); mutex_unlock(&group->notification_mutex); wake_up(&group->notification_waitq); @@ -230,32 +119,20 @@ alloc_holder: } /* - * Remove and return the first event from the notification list. There is a - * reference held on this event since it was on the list. It is the responsibility - * of the caller to drop this reference. + * Remove and return the first event from the notification list. 
It is the + * responsibility of the caller to destroy the obtained event */ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) { struct fsnotify_event *event; - struct fsnotify_event_holder *holder; BUG_ON(!mutex_is_locked(&group->notification_mutex)); pr_debug("%s: group=%p\n", __func__, group); - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - - event = holder->event; - - spin_lock(&event->lock); - holder->event = NULL; - list_del_init(&holder->event_list); - spin_unlock(&event->lock); - - /* event == holder means we are referenced through the in event holder */ - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - + event = list_first_entry(&group->notification_list, + struct fsnotify_event, list); + list_del(&event->list); group->q_len--; return event; @@ -266,15 +143,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group */ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) { - struct fsnotify_event *event; - struct fsnotify_event_holder *holder; - BUG_ON(!mutex_is_locked(&group->notification_mutex)); - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - event = holder->event; - - return event; + return list_first_entry(&group->notification_list, + struct fsnotify_event, list); } /* @@ -284,181 +156,31 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) void fsnotify_flush_notify(struct fsnotify_group *group) { struct fsnotify_event *event; - struct fsnotify_event_private_data *priv; mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_notify_event(group); - /* if they don't implement free_event_priv they better not have attached any */ - if (group->ops->free_event_priv) { - spin_lock(&event->lock); - priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - if (priv) - group->ops->free_event_priv(priv); - } - fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ + fsnotify_destroy_event(group, event); } mutex_unlock(&group->notification_mutex); } -static void initialize_event(struct fsnotify_event *event) -{ - INIT_LIST_HEAD(&event->holder.event_list); - atomic_set(&event->refcnt, 1); - - spin_lock_init(&event->lock); - - INIT_LIST_HEAD(&event->private_data_list); -} - -/* - * Caller damn well better be holding whatever mutex is protecting the - * old_holder->event_list and the new_event must be a clean event which - * cannot be found anywhere else in the kernel. - */ -int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, - struct fsnotify_event *new_event) -{ - struct fsnotify_event *old_event = old_holder->event; - struct fsnotify_event_holder *new_holder = &new_event->holder; - - enum event_spinlock_class { - SPINLOCK_OLD, - SPINLOCK_NEW, - }; - - pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event); - - /* - * if the new_event's embedded holder is in use someone - * screwed up and didn't give us a clean new event. 
- */ - BUG_ON(!list_empty(&new_holder->event_list)); - - spin_lock_nested(&old_event->lock, SPINLOCK_OLD); - spin_lock_nested(&new_event->lock, SPINLOCK_NEW); - - new_holder->event = new_event; - list_replace_init(&old_holder->event_list, &new_holder->event_list); - - spin_unlock(&new_event->lock); - spin_unlock(&old_event->lock); - - /* event == holder means we are referenced through the in event holder */ - if (old_holder != &old_event->holder) - fsnotify_destroy_event_holder(old_holder); - - fsnotify_get_event(new_event); /* on the list take reference */ - fsnotify_put_event(old_event); /* off the list, drop reference */ - - return 0; -} - -struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event) -{ - struct fsnotify_event *event; - - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); - if (!event) - return NULL; - - pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event); - - memcpy(event, old_event, sizeof(*event)); - initialize_event(event); - - if (event->name_len) { - event->file_name = kstrdup(old_event->file_name, GFP_KERNEL); - if (!event->file_name) { - kmem_cache_free(fsnotify_event_cachep, event); - return NULL; - } - } - event->tgid = get_pid(old_event->tgid); - if (event->data_type == FSNOTIFY_EVENT_PATH) - path_get(&event->path); - - return event; -} - /* * fsnotify_create_event - Allocate a new event which will be sent to each * group's handle_event function if the group was interested in this * particular event. * - * @to_tell the inode which is supposed to receive the event (sometimes a + * @inode the inode which is supposed to receive the event (sometimes a * parent of the inode to which the event happened. * @mask what actually happened. * @data pointer to the object which was actually affected * @data_type flag indication if the data is a file, path, inode, nothing... 
* @name the filename, if available */ -struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, - int data_type, const unsigned char *name, - u32 cookie, gfp_t gfp) +void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode, + u32 mask) { - struct fsnotify_event *event; - - event = kmem_cache_zalloc(fsnotify_event_cachep, gfp); - if (!event) - return NULL; - - pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n", - __func__, event, to_tell, mask, data, data_type); - - initialize_event(event); - - if (name) { - event->file_name = kstrdup(name, gfp); - if (!event->file_name) { - kmem_cache_free(fsnotify_event_cachep, event); - return NULL; - } - event->name_len = strlen(event->file_name); - } - - event->tgid = get_pid(task_tgid(current)); - event->sync_cookie = cookie; - event->to_tell = to_tell; - event->data_type = data_type; - - switch (data_type) { - case FSNOTIFY_EVENT_PATH: { - struct path *path = data; - event->path.dentry = path->dentry; - event->path.mnt = path->mnt; - path_get(&event->path); - break; - } - case FSNOTIFY_EVENT_INODE: - event->inode = data; - break; - case FSNOTIFY_EVENT_NONE: - event->inode = NULL; - event->path.dentry = NULL; - event->path.mnt = NULL; - break; - default: - BUG(); - } - + INIT_LIST_HEAD(&event->list); + event->inode = inode; event->mask = mask; - - return event; -} - -static __init int fsnotify_notification_init(void) -{ - fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); - fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); - - q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL, - FSNOTIFY_EVENT_NONE, NULL, 0, - GFP_KERNEL); - if (!q_overflow_event) - panic("unable to allocate fsnotify q_overflow_event\n"); - - return 0; } -subsys_initcall(fsnotify_notification_init); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 4b2ee8d12f5e..7f3d7dcfcd00 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -15,7 +15,6 @@ #include /* struct path */ #include #include - #include /* @@ -79,6 +78,7 @@ struct fsnotify_group; struct fsnotify_event; struct fsnotify_mark; struct fsnotify_event_private_data; +struct fsnotify_fname; /* * Each group much define these ops. The fsnotify infrastructure will call @@ -99,12 +99,26 @@ struct fsnotify_ops { struct fsnotify_mark *vfsmount_mark, __u32 mask, void *data, int data_type); int (*handle_event)(struct fsnotify_group *group, + struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event); + u32 mask, void *data, int data_type, + const unsigned char *file_name); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); - void (*free_event_priv)(struct fsnotify_event_private_data *priv); + void (*free_event)(struct fsnotify_event *event); +}; + +/* + * all of the information about the original object we want to now send to + * a group. If you want to carry more info from the accessing task to the + * listener this structure is where you need to be adding fields. + */ +struct fsnotify_event { + struct list_head list; + /* inode may ONLY be dereferenced during handle_event(). 
*/ + struct inode *inode; /* either the inode the event happened to or its parent */ + u32 mask; /* the type of access, bitwise OR for FS_* event types */ }; /* @@ -148,7 +162,11 @@ struct fsnotify_group { * a group */ struct list_head marks_list; /* all inode marks for this group */ - struct fasync_struct *fsn_fa; /* async notification */ + struct fasync_struct *fsn_fa; /* async notification */ + + struct fsnotify_event overflow_event; /* Event we queue when the + * notification list is too + * full */ /* groups can define private fields here or use the void *private */ union { @@ -177,76 +195,10 @@ struct fsnotify_group { }; }; -/* - * A single event can be queued in multiple group->notification_lists. - * - * each group->notification_list will point to an event_holder which in turns points - * to the actual event that needs to be sent to userspace. - * - * Seemed cheaper to create a refcnt'd event and a small holder for every group - * than create a different event for every group - * - */ -struct fsnotify_event_holder { - struct fsnotify_event *event; - struct list_head event_list; -}; - -/* - * Inotify needs to tack data onto an event. This struct lets us later find the - * correct private data of the correct group. - */ -struct fsnotify_event_private_data { - struct fsnotify_group *group; - struct list_head event_list; -}; - -/* - * all of the information about the original object we want to now send to - * a group. If you want to carry more info from the accessing task to the - * listener this structure is where you need to be adding fields. - */ -struct fsnotify_event { - /* - * If we create an event we are also likely going to need a holder - * to link to a group. So embed one holder in the event. Means only - * one allocation for the common case where we only have one group - */ - struct fsnotify_event_holder holder; - spinlock_t lock; /* protection for the associated event_holder and private_list */ - /* to_tell may ONLY be dereferenced during handle_event(). */ - struct inode *to_tell; /* either the inode the event happened to or its parent */ - /* - * depending on the event type we should have either a path or inode - * We hold a reference on path, but NOT on inode. Since we have the ref on - * the path, it may be dereferenced at any point during this object's - * lifetime. That reference is dropped when this object's refcnt hits - * 0. If this event contains an inode instead of a path, the inode may - * ONLY be used during handle_event(). 
- */ - union { - struct path path; - struct inode *inode; - }; /* when calling fsnotify tell it if the data is a path or inode */ #define FSNOTIFY_EVENT_NONE 0 #define FSNOTIFY_EVENT_PATH 1 #define FSNOTIFY_EVENT_INODE 2 - int data_type; /* which of the above union we have */ - atomic_t refcnt; /* how many groups still are using/need to send this event */ - __u32 mask; /* the type of access, bitwise OR for FS_* event types */ - - u32 sync_cookie; /* used to corrolate events, namely inotify mv events */ - const unsigned char *file_name; - size_t name_len; - struct pid *tgid; - -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - __u32 response; /* userspace answer to question */ -#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */ - - struct list_head private_data_list; /* groups can store private data here */ -}; /* * Inode specific fields in an fsnotify_mark @@ -370,17 +322,12 @@ extern void fsnotify_put_group(struct fsnotify_group *group); extern void fsnotify_destroy_group(struct fsnotify_group *group); /* fasync handler function */ extern int fsnotify_fasync(int fd, struct file *file, int on); -/* take a reference to an event */ -extern void fsnotify_get_event(struct fsnotify_event *event); -extern void fsnotify_put_event(struct fsnotify_event *event); -/* find private data previously attached to an event and unlink it */ -extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, - struct fsnotify_event *event); - +/* Free event from memory */ +extern void fsnotify_destroy_event(struct fsnotify_group *group, + struct fsnotify_event *event); /* attach the event to the group notification queue */ extern struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, - struct fsnotify_event_private_data *priv, struct fsnotify_event *(*merge)(struct list_head *, struct fsnotify_event *)); /* true if the group notification queue is empty */ @@ -430,15 +377,8 @@ extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_unmount_inodes(struct list_head *list); /* put here because inotify does some weird stuff when destroying watches */ -extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, - void *data, int data_is, - const unsigned char *name, - u32 cookie, gfp_t gfp); - -/* fanotify likes to change events after they are on lists... 
*/ -extern struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event); -extern int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, - struct fsnotify_event *new_event); +extern void fsnotify_init_event(struct fsnotify_event *event, + struct inode *to_tell, u32 mask); #else diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 43c307dc9453..bcc0b1821227 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -912,9 +912,11 @@ static void evict_chunk(struct audit_chunk *chunk) } static int audit_tree_handle_event(struct fsnotify_group *group, + struct inode *to_tell, struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmonut_mark, - struct fsnotify_event *event) + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) { BUG(); return -EOPNOTSUPP; @@ -945,7 +947,7 @@ static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, .should_send_event = audit_tree_send_event, .free_group_priv = NULL, - .free_event_priv = NULL, + .free_event = NULL, .freeing_mark = audit_tree_freeing_mark, }; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 22831c4d369c..a760c32cb639 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -475,25 +475,25 @@ static bool audit_watch_should_send_event(struct fsnotify_group *group, struct i /* Update watch data in audit rules based on fsnotify events. */ static int audit_watch_handle_event(struct fsnotify_group *group, + struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) + u32 mask, void *data, int data_type, + const unsigned char *dname) { struct inode *inode; - __u32 mask = event->mask; - const char *dname = event->file_name; struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); BUG_ON(group != audit_watch_group); - switch (event->data_type) { + switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = event->path.dentry->d_inode; + inode = ((struct path *)data)->dentry->d_inode; break; case (FSNOTIFY_EVENT_INODE): - inode = event->inode; + inode = (struct inode *)data; break; default: BUG(); @@ -516,7 +516,7 @@ static const struct fsnotify_ops audit_watch_fsnotify_ops = { .handle_event = audit_watch_handle_event, .free_group_priv = NULL, .freeing_mark = NULL, - .free_event_priv = NULL, + .free_event = NULL, }; static int __init audit_watch_init(void) -- cgit 1.4.1 From 83c4c4b0a3aadc1ce7b5b2870ce1fc1f65498da0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Jan 2014 15:48:15 -0800 Subject: fsnotify: remove .should_send_event callback After removing event structure creation from the generic layer there is no reason for separate .should_send_event and .handle_event callbacks. So just remove the first one. 
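With the callback gone, each backend performs its own early filtering at the top of ->handle_event() and returns 0 for events it does not care about. A minimal sketch of the pattern, modelled on the dnotify change in the diff below:

    static int dnotify_handle_event(struct fsnotify_group *group,
                                    struct inode *inode,
                                    struct fsnotify_mark *inode_mark,
                                    struct fsnotify_mark *vfsmount_mark,
                                    u32 mask, void *data, int data_type,
                                    const unsigned char *file_name)
    {
            /* what used to live in .should_send_event: */
            if (!S_ISDIR(inode->i_mode))
                    return 0;       /* not a dir, dnotify doesn't care */

            /* ... deliver the event as before ... */
            return 0;
    }
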
Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 22 ++++------------------ fs/notify/fanotify/fanotify.c | 18 ++++++++++-------- fs/notify/fsnotify.c | 5 ----- fs/notify/inotify/inotify_fsnotify.c | 24 +++++++----------------- include/linux/fsnotify_backend.h | 4 ---- kernel/audit_tree.c | 12 +----------- kernel/audit_watch.c | 9 --------- 7 files changed, 22 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index bfca53dbbf34..928688e3ee2f 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -94,6 +94,10 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct fown_struct *fown; __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; + /* not a dir, dnotify doesn't care */ + if (!S_ISDIR(inode->i_mode)) + return 0; + BUG_ON(vfsmount_mark); dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); @@ -121,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group, return 0; } -/* - * Given an inode and mask determine if dnotify would be interested in sending - * userspace notification for that pair. - */ -static bool dnotify_should_send_event(struct fsnotify_group *group, - struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - /* not a dir, dnotify doesn't care */ - if (!S_ISDIR(inode->i_mode)) - return false; - - return true; -} - static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) { struct dnotify_mark *dn_mark = container_of(fsn_mark, @@ -151,7 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) static struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, - .should_send_event = dnotify_should_send_event, .free_group_priv = NULL, .freeing_mark = NULL, .free_event = NULL, diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index c26268d7bd9d..1f8f05220f8d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -88,18 +88,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, } #endif -static bool fanotify_should_send_event(struct fsnotify_group *group, - struct inode *inode, - struct fsnotify_mark *inode_mark, +static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, - __u32 event_mask, void *data, int data_type) + u32 event_mask, + void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; struct path *path = data; - pr_debug("%s: group=%p inode=%p inode_mark=%p vfsmnt_mark=%p " - "mask=%x data=%p data_type=%d\n", __func__, group, inode, - inode_mark, vfsmnt_mark, event_mask, data, data_type); + pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" + " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, + event_mask, data, data_type); /* if we don't have enough info to send an event to userspace say no */ if (data_type != FSNOTIFY_EVENT_PATH) @@ -163,6 +162,10 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data, + data_type)) + return 0; + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, mask); @@ -225,7 +228,6 @@ static void fanotify_free_event(struct fsnotify_event 
*fsn_event) const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, - .should_send_event = fanotify_should_send_event, .free_group_priv = fanotify_free_group_priv, .free_event = fanotify_free_event, .freeing_mark = NULL, diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 7c754c91c3f6..1d4e1ea2f37c 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -177,11 +177,6 @@ static int send_to_group(struct inode *to_tell, if (!inode_test_mask && !vfsmount_test_mask) return 0; - if (group->ops->should_send_event(group, to_tell, inode_mark, - vfsmount_mark, mask, data, - data_is) == false) - return 0; - return group->ops->handle_event(group, to_tell, inode_mark, vfsmount_mark, mask, data, data_is, file_name); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 6fabbd163d16..aad1a35e9af1 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -81,6 +81,13 @@ int inotify_handle_event(struct fsnotify_group *group, BUG_ON(vfsmount_mark); + if ((inode_mark->mask & FS_EXCL_UNLINK) && + (data_type == FSNOTIFY_EVENT_PATH)) { + struct path *path = data; + + if (d_unlinked(path->dentry)) + return 0; + } if (file_name) { len = strlen(file_name); alloc_len += len + 1; @@ -122,22 +129,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify inotify_ignored_and_remove_idr(fsn_mark, group); } -static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - if ((inode_mark->mask & FS_EXCL_UNLINK) && - (data_type == FSNOTIFY_EVENT_PATH)) { - struct path *path = data; - - if (d_unlinked(path->dentry)) - return false; - } - - return true; -} - /* * This is NEVER supposed to be called. Inotify marks should either have been * removed from the idr when the watch was removed or in the @@ -189,7 +180,6 @@ static void inotify_free_event(struct fsnotify_event *fsn_event) const struct fsnotify_ops inotify_fsnotify_ops = { .handle_event = inotify_handle_event, - .should_send_event = inotify_should_send_event, .free_group_priv = inotify_free_group_priv, .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7f3d7dcfcd00..7d8d5e608594 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -94,10 +94,6 @@ struct fsnotify_fname; * userspace messages that marks have been removed. 
*/ struct fsnotify_ops { - bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type); int (*handle_event)(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index bcc0b1821227..ae8103b057fa 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -918,8 +918,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, u32 mask, void *data, int data_type, const unsigned char *file_name) { - BUG(); - return -EOPNOTSUPP; + return 0; } static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) @@ -935,17 +934,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify BUG_ON(atomic_read(&entry->refcnt) < 1); } -static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return false; -} - static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, - .should_send_event = audit_tree_send_event, .free_group_priv = NULL, .free_event = NULL, .freeing_mark = audit_tree_freeing_mark, diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index a760c32cb639..367ac9a79acc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -465,14 +465,6 @@ void audit_remove_watch_rule(struct audit_krule *krule) } } -static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return true; -} - /* Update watch data in audit rules based on fsnotify events. */ static int audit_watch_handle_event(struct fsnotify_group *group, struct inode *to_tell, @@ -512,7 +504,6 @@ static int audit_watch_handle_event(struct fsnotify_group *group, } static const struct fsnotify_ops audit_watch_fsnotify_ops = { - .should_send_event = audit_watch_should_send_event, .handle_event = audit_watch_handle_event, .free_group_priv = NULL, .freeing_mark = NULL, -- cgit 1.4.1 From 0afaa12047a45ebe651f29a3b4818e523f862c28 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 21 Jan 2014 15:48:42 -0800 Subject: posix_acl: uninlining Uninline vast tracts of nested inline functions in include/linux/posix_acl.h. This reduces the text+data+bss size of x86_64 allyesconfig vmlinux by 8026 bytes. The patch also regularises the positioning of the EXPORT_SYMBOLs in posix_acl.c. Cc: Alexander Viro Cc: J. 
Bruce Fields Cc: Trond Myklebust Tested-by: Benny Halevy Cc: Benny Halevy Cc: Andreas Gruenbacher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/posix_acl.c | 84 ++++++++++++++++++++++++++++++++++++++++++++--- include/linux/posix_acl.h | 78 ++++--------------------------------------- 2 files changed, 85 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8bd2135b7f82..021e7c069b86 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -22,11 +22,80 @@ #include -EXPORT_SYMBOL(posix_acl_init); -EXPORT_SYMBOL(posix_acl_alloc); -EXPORT_SYMBOL(posix_acl_valid); -EXPORT_SYMBOL(posix_acl_equiv_mode); -EXPORT_SYMBOL(posix_acl_from_mode); +struct posix_acl **acl_by_type(struct inode *inode, int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return &inode->i_acl; + case ACL_TYPE_DEFAULT: + return &inode->i_default_acl; + default: + BUG(); + } +} +EXPORT_SYMBOL(acl_by_type); + +struct posix_acl *get_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *acl = ACCESS_ONCE(*p); + if (acl) { + spin_lock(&inode->i_lock); + acl = *p; + if (acl != ACL_NOT_CACHED) + acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); + } + return acl; +} +EXPORT_SYMBOL(get_cached_acl); + +struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) +{ + return rcu_dereference(*acl_by_type(inode, type)); +} +EXPORT_SYMBOL(get_cached_acl_rcu); + +void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *old; + spin_lock(&inode->i_lock); + old = *p; + rcu_assign_pointer(*p, posix_acl_dup(acl)); + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} +EXPORT_SYMBOL(set_cached_acl); + +void forget_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *old; + spin_lock(&inode->i_lock); + old = *p; + *p = ACL_NOT_CACHED; + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} +EXPORT_SYMBOL(forget_cached_acl); + +void forget_all_cached_acls(struct inode *inode) +{ + struct posix_acl *old_access, *old_default; + spin_lock(&inode->i_lock); + old_access = inode->i_acl; + old_default = inode->i_default_acl; + inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; + spin_unlock(&inode->i_lock); + if (old_access != ACL_NOT_CACHED) + posix_acl_release(old_access); + if (old_default != ACL_NOT_CACHED) + posix_acl_release(old_default); +} +EXPORT_SYMBOL(forget_all_cached_acls); /* * Init a fresh posix_acl @@ -37,6 +106,7 @@ posix_acl_init(struct posix_acl *acl, int count) atomic_set(&acl->a_refcount, 1); acl->a_count = count; } +EXPORT_SYMBOL(posix_acl_init); /* * Allocate a new ACL with the specified number of entries. @@ -51,6 +121,7 @@ posix_acl_alloc(int count, gfp_t flags) posix_acl_init(acl, count); return acl; } +EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. @@ -146,6 +217,7 @@ posix_acl_valid(const struct posix_acl *acl) return 0; return -EINVAL; } +EXPORT_SYMBOL(posix_acl_valid); /* * Returns 0 if the acl can be exactly represented in the traditional @@ -186,6 +258,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) *mode_p = (*mode_p & ~S_IRWXUGO) | mode; return not_equiv; } +EXPORT_SYMBOL(posix_acl_equiv_mode); /* * Create an ACL representing the file mode permission bits of an inode. 
@@ -207,6 +280,7 @@ posix_acl_from_mode(umode_t mode, gfp_t flags) acl->a_entries[2].e_perm = (mode & S_IRWXO); return acl; } +EXPORT_SYMBOL(posix_acl_from_mode); /* * Return 0 if current is granted want access to the inode diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 7931efe71175..fb616942e4c7 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -94,78 +94,12 @@ extern int posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); extern int set_posix_acl(struct inode *, int, struct posix_acl *); -#ifdef CONFIG_FS_POSIX_ACL -static inline struct posix_acl **acl_by_type(struct inode *inode, int type) -{ - switch (type) { - case ACL_TYPE_ACCESS: - return &inode->i_acl; - case ACL_TYPE_DEFAULT: - return &inode->i_default_acl; - default: - BUG(); - } -} - -static inline struct posix_acl *get_cached_acl(struct inode *inode, int type) -{ - struct posix_acl **p = acl_by_type(inode, type); - struct posix_acl *acl = ACCESS_ONCE(*p); - if (acl) { - spin_lock(&inode->i_lock); - acl = *p; - if (acl != ACL_NOT_CACHED) - acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); - } - return acl; -} - -static inline struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) -{ - return rcu_dereference(*acl_by_type(inode, type)); -} - -static inline void set_cached_acl(struct inode *inode, - int type, - struct posix_acl *acl) -{ - struct posix_acl **p = acl_by_type(inode, type); - struct posix_acl *old; - spin_lock(&inode->i_lock); - old = *p; - rcu_assign_pointer(*p, posix_acl_dup(acl)); - spin_unlock(&inode->i_lock); - if (old != ACL_NOT_CACHED) - posix_acl_release(old); -} - -static inline void forget_cached_acl(struct inode *inode, int type) -{ - struct posix_acl **p = acl_by_type(inode, type); - struct posix_acl *old; - spin_lock(&inode->i_lock); - old = *p; - *p = ACL_NOT_CACHED; - spin_unlock(&inode->i_lock); - if (old != ACL_NOT_CACHED) - posix_acl_release(old); -} - -static inline void forget_all_cached_acls(struct inode *inode) -{ - struct posix_acl *old_access, *old_default; - spin_lock(&inode->i_lock); - old_access = inode->i_acl; - old_default = inode->i_default_acl; - inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; - spin_unlock(&inode->i_lock); - if (old_access != ACL_NOT_CACHED) - posix_acl_release(old_access); - if (old_default != ACL_NOT_CACHED) - posix_acl_release(old_default); -} -#endif +struct posix_acl **acl_by_type(struct inode *inode, int type); +struct posix_acl *get_cached_acl(struct inode *inode, int type); +struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); +void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); +void forget_cached_acl(struct inode *inode, int type); +void forget_all_cached_acls(struct inode *inode); static inline void cache_no_acl(struct inode *inode) { -- cgit 1.4.1 From f92f455f67fef27929e6043499414605b0c94872 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 21 Jan 2014 15:48:47 -0800 Subject: mm: Make {,set}page_address() static inline if WANT_PAGE_VIRTUAL {,set}page_address() are macros if WANT_PAGE_VIRTUAL. If !WANT_PAGE_VIRTUAL, they're plain C functions. 
If someone calls them with a void *, this pointer is auto-converted to struct page * if !WANT_PAGE_VIRTUAL, but causes a build failure on architectures using WANT_PAGE_VIRTUAL (arc, m68k and sparc64): drivers/md/bcache/bset.c: In function `__btree_sort': drivers/md/bcache/bset.c:1190: warning: dereferencing `void *' pointer drivers/md/bcache/bset.c:1190: error: request for member `virtual' in something not a structure or union Convert them to static inline functions to fix this. There are already plenty of users of struct page members inside , so there's no reason to keep them as macros. Signed-off-by: Geert Uytterhoeven Acked-by: Michael S. Tsirkin Tested-by: Guenter Roeck Tested-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 35527173cf50..9fac6dd69b11 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -846,11 +846,14 @@ static __always_inline void *lowmem_page_address(const struct page *page) #endif #if defined(WANT_PAGE_VIRTUAL) -#define page_address(page) ((page)->virtual) -#define set_page_address(page, address) \ - do { \ - (page)->virtual = (address); \ - } while(0) +static inline void *page_address(const struct page *page) +{ + return page->virtual; +} +static inline void set_page_address(struct page *page, void *address) +{ + page->virtual = address; +} #define page_address_init() do { } while(0) #endif -- cgit 1.4.1 From 0e147aed4c250766e657855db55a395d2c8008a5 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 21 Jan 2014 15:48:48 -0800 Subject: mm: hugetlbfs: Add some VM_BUG_ON()s to catch non-hugetlbfs pages Dave Jiang reported that he was seeing oopses when running NUMA systems and default_hugepagesz=1G. I traced the issue down to migrate_page_copy() trying to use the same code for hugetlb pages and transparent hugepages. It should not have been trying to pass thp pages in there. So, add some VM_BUG_ON()s for the next hapless VM developer that tries the same thing. Signed-off-by: Dave Hansen Reviewed-by: Naoya Horiguchi Tested-by: Dave Jiang Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index bd7e98752222..251233c1494d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -360,6 +360,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, static inline struct hstate *page_hstate(struct page *page) { + VM_BUG_ON(!PageHuge(page)); return size_to_hstate(PAGE_SIZE << compound_order(page)); } -- cgit 1.4.1 From ca641514f4056deee1fb2eb356e2c99b98718ade Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 21 Jan 2014 15:48:52 -0800 Subject: mm: thp: optimize compound_trans_huge Currently we don't clobber page_tail->first_page during split_huge_page, so compound_trans_head can be set to compound_head without adverse effects, and this mostly optimizes away a smp_rmb. It looks worthwhile to keep around the implementation that doesn't relay on page_tail->first_page not to be clobbered, because it would be necessary if we'll decide to enforce page->private to zero at all times whenever PG_private is not set, also for anonymous pages. 
For anonymous pages enforcing such an invariant doesn't matter as anonymous pages don't use page->private so we can get away with this microoptimization. Signed-off-by: Andrea Arcangeli Cc: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 91672e2deec3..db512014e061 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -157,6 +157,26 @@ static inline int hpage_nr_pages(struct page *page) return HPAGE_PMD_NR; return 1; } +/* + * compound_trans_head() should be used instead of compound_head(), + * whenever the "page" passed as parameter could be the tail of a + * transparent hugepage that could be undergoing a + * __split_huge_page_refcount(). The page structure layout often + * changes across releases and it makes extensive use of unions. So if + * the page structure layout will change in a way that + * page->first_page gets clobbered by __split_huge_page_refcount, the + * implementation making use of smp_rmb() will be required. + * + * Currently we define compound_trans_head as compound_head, because + * page->private is in the same union with page->first_page, and + * page->private isn't clobbered. However this also means we're + * currently leaving dirt into the page->private field of anonymous + * pages resulting from a THP split, instead of setting page->private + * to zero like for every other page that has PG_private not set. But + * anonymous pages don't use page->private so this is not a problem. + */ +#if 0 +/* This will be needed if page->private will be clobbered in split_huge_page */ static inline struct page *compound_trans_head(struct page *page) { if (PageTail(page)) { @@ -174,6 +194,9 @@ static inline struct page *compound_trans_head(struct page *page) } return page; } +#else +#define compound_trans_head(page) compound_head(page) +#endif extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp); -- cgit 1.4.1 From 44518d2b32646e37b4b7a0813bbbe98dc21c7f8f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 21 Jan 2014 15:48:54 -0800 Subject: mm: tail page refcounting optimization for slab and hugetlbfs This skips the _mapcount mangling for slab and hugetlbfs pages. The main trouble in doing this is to guarantee that PageSlab and PageHeadHuge remains constant for all get_page/put_page run on the tail of slab or hugetlbfs compound pages. Otherwise if they're set during get_page but not set during put_page, the _mapcount of the tail page would underflow. PageHeadHuge will remain true until the compound page is released and enters the buddy allocator so it won't risk to change even if the tail page is the last reference left on the page. PG_slab instead is cleared before the slab frees the head page with put_page, so if the tail pin is released after the slab freed the page, we would have a problem. But in the slab case the tail pin cannot be the last reference left on the page. This is because the slab code is free to reuse the compound page after a kfree/kmem_cache_free without having to check if there's any tail pin left. 
In turn all tail pins must be always released while the head is still pinned by the slab code and so we know PG_slab will be still set too. Signed-off-by: Andrea Arcangeli Reviewed-by: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 ------ include/linux/mm.h | 32 +++++++++++++++++++++++++++++++- mm/internal.h | 3 ++- mm/swap.c | 33 +++++++++++++++++++++++++++------ 4 files changed, 60 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 251233c1494d..d01cc972a1d9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -31,7 +31,6 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); void hugepage_put_subpool(struct hugepage_subpool *spool); int PageHuge(struct page *page); -int PageHeadHuge(struct page *page_head); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); @@ -104,11 +103,6 @@ static inline int PageHuge(struct page *page) return 0; } -static inline int PageHeadHuge(struct page *page_head) -{ - return 0; -} - static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index 9fac6dd69b11..f95c71b7c1fd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -414,15 +414,45 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } +#ifdef CONFIG_HUGETLB_PAGE +extern int PageHeadHuge(struct page *page_head); +#else /* CONFIG_HUGETLB_PAGE */ +static inline int PageHeadHuge(struct page *page_head) +{ + return 0; +} +#endif /* CONFIG_HUGETLB_PAGE */ + +static inline bool __compound_tail_refcounted(struct page *page) +{ + return !PageSlab(page) && !PageHeadHuge(page); +} + +/* + * This takes a head page as parameter and tells if the + * tail page reference counting can be skipped. + * + * For this to be safe, PageSlab and PageHeadHuge must remain true on + * any given page where they return true here, until all tail pins + * have been released. + */ +static inline bool compound_tail_refcounted(struct page *page) +{ + VM_BUG_ON(!PageHead(page)); + return __compound_tail_refcounted(page); +} + static inline void get_huge_page_tail(struct page *page) { /* * __split_huge_page_refcount() cannot run * from under us. + * In turn no need of compound_trans_head here. */ VM_BUG_ON(page_mapcount(page) < 0); VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); + if (compound_tail_refcounted(compound_head(page))) + atomic_inc(&page->_mapcount); } extern bool __get_page_tail(struct page *page); diff --git a/mm/internal.h b/mm/internal.h index 684f7aa9692a..a85a3ab1f7ef 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -51,7 +51,8 @@ static inline void __get_page_tail_foll(struct page *page, VM_BUG_ON(page_mapcount(page) < 0); if (get_page_head) atomic_inc(&page->first_page->_count); - atomic_inc(&page->_mapcount); + if (compound_tail_refcounted(page->first_page)) + atomic_inc(&page->_mapcount); } /* diff --git a/mm/swap.c b/mm/swap.c index e2757fbb04ea..bba4aa5bf686 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -88,8 +88,9 @@ static void put_compound_page(struct page *page) /* * THP can not break up slab pages so avoid taking - * compound_lock(). 
Slab performs non-atomic bit ops - * on page->flags for better performance. In + * compound_lock() and skip the tail page refcounting + * (in _mapcount) too. Slab performs non-atomic bit + * ops on page->flags for better performance. In * particular slab_unlock() in slub used to be a hot * path. It is still hot on arches that do not support * this_cpu_cmpxchg_double(). @@ -102,7 +103,7 @@ static void put_compound_page(struct page *page) * PageTail clear after smp_rmb() and we'll treat it * as a single page. */ - if (PageSlab(page_head) || PageHeadHuge(page_head)) { + if (!__compound_tail_refcounted(page_head)) { /* * If "page" is a THP tail, we must read the tail page * flags after the head page flags. The @@ -117,10 +118,30 @@ static void put_compound_page(struct page *page) * cannot race here. */ VM_BUG_ON(!PageHead(page_head)); - VM_BUG_ON(page_mapcount(page) <= 0); - atomic_dec(&page->_mapcount); - if (put_page_testzero(page_head)) + VM_BUG_ON(page_mapcount(page) != 0); + if (put_page_testzero(page_head)) { + /* + * If this is the tail of a + * slab compound page, the + * tail pin must not be the + * last reference held on the + * page, because the PG_slab + * cannot be cleared before + * all tail pins (which skips + * the _mapcount tail + * refcounting) have been + * released. For hugetlbfs the + * tail pin may be the last + * reference on the page + * instead, because + * PageHeadHuge will not go + * away until the compound + * page enters the buddy + * allocator. + */ + VM_BUG_ON(PageSlab(page_head)); __put_compound_page(page_head); + } return; } else /* -- cgit 1.4.1 From 5eaf1a9e233d61438377f57facb167f8208ba9fd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jan 2014 15:49:04 -0800 Subject: mm: thp: turn compound_head() into BUG_ON(!PageTail) in get_huge_page_tail() get_huge_page_tail()->compound_head() looks confusing. Every caller must check PageTail(page), otherwise atomic_inc(&page->_mapcount) is simply wrong if this page is compound-trans-head. Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Dave Jones Cc: Darren Hart Cc: Peter Zijlstra Cc: Mel Gorman Acked-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f95c71b7c1fd..58202c26c559 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -445,13 +445,12 @@ static inline bool compound_tail_refcounted(struct page *page) static inline void get_huge_page_tail(struct page *page) { /* - * __split_huge_page_refcount() cannot run - * from under us. - * In turn no need of compound_trans_head here. + * __split_huge_page_refcount() cannot run from under us. */ + VM_BUG_ON(!PageTail(page)); VM_BUG_ON(page_mapcount(page) < 0); VM_BUG_ON(atomic_read(&page->_count) != 0); - if (compound_tail_refcounted(compound_head(page))) + if (compound_tail_refcounted(page->first_page)) atomic_inc(&page->_mapcount); } -- cgit 1.4.1 From 943dca1a1fcbccb58de944669b833fd38a6c809b Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Tue, 21 Jan 2014 15:49:06 -0800 Subject: mm: get rid of unnecessary pageblock scanning in setup_zone_migrate_reserve Yasuaki Ishimatsu reported memory hot-add spent more than 5 _hours_ on 9TB memory machine since onlining memory sections is too slow. And we found out setup_zone_migrate_reserve spent >90% of the time. 
The problem is, setup_zone_migrate_reserve scans all pageblocks unconditionally, but it is only necessary if the number of reserved block was reduced (i.e. memory hot remove). Moreover, maximum MIGRATE_RESERVE per zone is currently 2. It means that the number of reserved pageblocks is almost always unchanged. This patch adds zone->nr_migrate_reserve_block to maintain the number of MIGRATE_RESERVE pageblocks and it reduces the overhead of setup_zone_migrate_reserve dramatically. The following table shows time of onlining a memory section. Amount of memory | 128GB | 192GB | 256GB| --------------------------------------------- linux-3.12 | 23.9 | 31.4 | 44.5 | This patch | 8.3 | 8.3 | 8.6 | Mel's proposal patch | 10.9 | 19.2 | 31.3 | --------------------------------------------- (millisecond) 128GB : 4 nodes and each node has 32GB of memory 192GB : 6 nodes and each node has 32GB of memory 256GB : 8 nodes and each node has 32GB of memory (*1) Mel proposed his idea by the following threads. https://lkml.org/lkml/2013/10/30/272 [akpm@linux-foundation.org: tweak comment] Signed-off-by: KOSAKI Motohiro Signed-off-by: Yasuaki Ishimatsu Reported-by: Yasuaki Ishimatsu Tested-by: Yasuaki Ishimatsu Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ++++++ mm/page_alloc.c | 13 +++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bd791e452ad7..67ab5febabf7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -489,6 +489,12 @@ struct zone { unsigned long present_pages; unsigned long managed_pages; + /* + * Number of MIGRATE_RESEVE page block. To maintain for just + * optimization. Protected by zone->lock. + */ + int nr_migrate_reserve_block; + /* * rarely used fields: */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5248fe070aa4..89d81f4429ca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3901,6 +3901,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) struct page *page; unsigned long block_migratetype; int reserve; + int old_reserve; /* * Get the start pfn, end pfn and the number of blocks to reserve @@ -3922,6 +3923,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) * future allocation of hugepages at runtime. */ reserve = min(2, reserve); + old_reserve = zone->nr_migrate_reserve_block; + + /* When memory hot-add, we almost always need to do nothing */ + if (reserve == old_reserve) + return; + zone->nr_migrate_reserve_block = reserve; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { if (!pfn_valid(pfn)) @@ -3959,6 +3966,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) reserve--; continue; } + } else if (!old_reserve) { + /* + * At boot time we don't need to scan the whole zone + * for turning off MIGRATE_RESERVE. + */ + break; } /* -- cgit 1.4.1 From b35f1819acd9243a3ff7ad25b1fa8bd6bfe80fb2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 21 Jan 2014 15:49:07 -0800 Subject: mm: create a separate slab for page->ptl allocation If DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC are enabled spinlock_t on x86_64 is 72 bytes. For page->ptl they will be allocated from kmalloc-96 slab, so we loose 24 on each. An average system can easily allocate few tens thousands of page->ptl and overhead is significant. Let's create a separate slab for page->ptl allocation to solve this. 
To make sure that it really works this time, some numbers from my test machine (just booted, no load): Before: # grep '^\(kmalloc-96\|page->ptl\)' /proc/slabinfo kmalloc-96 31987 32190 128 30 1 : tunables 120 60 8 : slabdata 1073 1073 92 After: # grep '^\(kmalloc-96\|page->ptl\)' /proc/slabinfo page->ptl 27516 28143 72 53 1 : tunables 120 60 8 : slabdata 531 531 9 kmalloc-96 3853 5280 128 30 1 : tunables 120 60 8 : slabdata 176 176 0 Note that the patch is useful not only for debug case, but also for PREEMPT_RT, where spinlock_t is always bloated. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ++++++++++++ init/main.c | 2 +- mm/memory.c | 13 +++++++++++-- 3 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 58202c26c559..fc4415256ec3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1350,6 +1350,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS +void __init ptlock_cache_init(void); extern bool ptlock_alloc(struct page *page); extern void ptlock_free(struct page *page); @@ -1358,6 +1359,10 @@ static inline spinlock_t *ptlock_ptr(struct page *page) return page->ptl; } #else /* ALLOC_SPLIT_PTLOCKS */ +static inline void ptlock_cache_init(void) +{ +} + static inline bool ptlock_alloc(struct page *page) { return true; @@ -1410,10 +1415,17 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } +static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct page *page) { return true; } static inline void pte_lock_deinit(struct page *page) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ +static inline void pgtable_init(void) +{ + ptlock_cache_init(); + pgtable_cache_init(); +} + static inline bool pgtable_page_ctor(struct page *page) { inc_zone_page_state(page, NR_PAGETABLE); diff --git a/init/main.c b/init/main.c index febc511e078a..01573fdfa186 100644 --- a/init/main.c +++ b/init/main.c @@ -476,7 +476,7 @@ static void __init mm_init(void) mem_init(); kmem_cache_init(); percpu_init_late(); - pgtable_cache_init(); + pgtable_init(); vmalloc_init(); } diff --git a/mm/memory.c b/mm/memory.c index e9c550484ba6..86487dfa5e59 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4275,11 +4275,20 @@ void copy_user_huge_page(struct page *dst, struct page *src, #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS + +static struct kmem_cache *page_ptl_cachep; + +void __init ptlock_cache_init(void) +{ + page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, + SLAB_PANIC, NULL); +} + bool ptlock_alloc(struct page *page) { spinlock_t *ptl; - ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); if (!ptl) return false; page->ptl = ptl; @@ -4288,6 +4297,6 @@ bool ptlock_alloc(struct page *page) void ptlock_free(struct page *page) { - kfree(page->ptl); + kmem_cache_free(page_ptl_cachep, page->ptl); } #endif -- cgit 1.4.1 From d80be7c75136fb58ed7264ac8a49dd917ace77a1 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 21 Jan 2014 15:49:10 -0800 Subject: mm, mempolicy: remove unneeded functions for UMA configs Mempolicies only exist for CONFIG_NUMA configurations. 
Therefore, a certain class of functions are unneeded in configurations where CONFIG_NUMA is disabled such as functions that duplicate existing mempolicies, lookup existing policies, set certain mempolicy traits, or test mempolicies for certain attributes. Remove the unneeded functions so that any future callers get a compile- time error and protect their code with CONFIG_NUMA as required. Signed-off-by: David Rientjes Cc: KOSAKI Motohiro Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'include') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 9fe426b30a41..5f1ea756aace 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -211,20 +211,8 @@ static inline void mpol_get(struct mempolicy *pol) { } -static inline struct mempolicy *mpol_dup(struct mempolicy *old) -{ - return NULL; -} - struct shared_policy {}; -static inline int mpol_set_shared_policy(struct shared_policy *info, - struct vm_area_struct *vma, - struct mempolicy *new) -{ - return -EINVAL; -} - static inline void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { @@ -234,12 +222,6 @@ static inline void mpol_free_shared_policy(struct shared_policy *p) { } -static inline struct mempolicy * -mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) -{ - return NULL; -} - #define vma_policy(vma) NULL static inline int @@ -266,10 +248,6 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { } -static inline void mpol_fix_fork_child_flag(struct task_struct *p) -{ -} - static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) @@ -284,12 +262,6 @@ static inline bool init_nodemask_of_mempolicy(nodemask_t *m) return false; } -static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk, - const nodemask_t *mask) -{ - return false; -} - static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { @@ -307,10 +279,6 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol) } #endif -static inline void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) -{ -} - static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long address) { -- cgit 1.4.1 From aec6a8889a98a0cd58357cd0937a25189908f191 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:49:13 -0800 Subject: mm, show_mem: remove SHOW_MEM_FILTER_PAGE_COUNT Commit 4b59e6c47309 ("mm, show_mem: suppress page counts in non-blockable contexts") introduced SHOW_MEM_FILTER_PAGE_COUNT to suppress PFN walks on large memory machines. Commit c78e93630d15 ("mm: do not walk all of system memory during show_mem") avoided a PFN walk in the generic show_mem helper which removes the requirement for SHOW_MEM_FILTER_PAGE_COUNT in that case. This patch removes PFN walkers from the arch-specific implementations that report on a per-node or per-zone granularity. ARM and unicore32 still do a PFN walk as they report memory usage on each bank which is a much finer granularity where the debugging information may still be of use. As the remaining arches doing PFN walks have relatively small amounts of memory, this patch simply removes SHOW_MEM_FILTER_PAGE_COUNT. 
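As a rough sketch of the replacement pattern (the example_ name is hypothetical; the body mirrors the ia64 and parisc conversions in this patch), a per-node summary can be produced by walking only the populated zones and deriving the reserved count from present_pages - managed_pages, instead of touching every pfn:

static void example_show_mem_summary(unsigned int filter)
{
	unsigned long total_present = 0;
	unsigned long total_reserved = 0;
	pg_data_t *pgdat;

	for_each_online_pgdat(pgdat) {
		int zoneid;

		if (skip_free_areas_node(filter, pgdat->node_id))
			continue;

		/* the real conversions also take pgdat_resize_lock() here */
		for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
			struct zone *zone = &pgdat->node_zones[zoneid];

			if (!populated_zone(zone))
				continue;
			total_present += zone->present_pages;
			total_reserved += zone->present_pages -
					  zone->managed_pages;
		}
	}
	printk(KERN_INFO "%lu pages of RAM\n", total_present);
	printk(KERN_INFO "%lu reserved pages\n", total_reserved);
}
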
[akpm@linux-foundation.org: fix parisc] Signed-off-by: Mel Gorman Acked-by: David Rientjes Cc: Tony Luck Cc: Russell King Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/init.c | 3 --- arch/ia64/mm/contig.c | 68 ------------------------------------------------ arch/ia64/mm/discontig.c | 63 -------------------------------------------- arch/ia64/mm/init.c | 48 ++++++++++++++++++++++++++++++++++ arch/parisc/mm/init.c | 59 ++++++++++++----------------------------- arch/unicore32/mm/init.c | 3 --- include/linux/mm.h | 1 - lib/show_mem.c | 3 --- mm/page_alloc.c | 7 ----- 9 files changed, 65 insertions(+), 190 deletions(-) (limited to 'include') diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 3e8f106ee5fe..2e71e245df90 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -92,9 +92,6 @@ void show_mem(unsigned int filter) printk("Mem-info:\n"); show_free_areas(filter); - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) - return; - for_each_bank (i, mi) { struct membank *bank = &mi->bank[i]; unsigned int pfn1, pfn2; diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index da5237d636d6..52715a71aede 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -31,74 +31,6 @@ static unsigned long max_gap; #endif -/** - * show_mem - give short summary of memory stats - * - * Shows a simple page count of reserved and used pages in the system. - * For discontig machines, it does this on a per-pgdat basis. - */ -void show_mem(unsigned int filter) -{ - int i, total_reserved = 0; - int total_shared = 0, total_cached = 0; - unsigned long total_present = 0; - pg_data_t *pgdat; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(filter); - printk(KERN_INFO "Node memory in pages:\n"); - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) - return; - for_each_online_pgdat(pgdat) { - unsigned long present; - unsigned long flags; - int shared = 0, cached = 0, reserved = 0; - int nid = pgdat->node_id; - - if (skip_free_areas_node(filter, nid)) - continue; - pgdat_resize_lock(pgdat, &flags); - present = pgdat->node_present_pages; - for(i = 0; i < pgdat->node_spanned_pages; i++) { - struct page *page; - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) - touch_nmi_watchdog(); - if (pfn_valid(pgdat->node_start_pfn + i)) - page = pfn_to_page(pgdat->node_start_pfn + i); - else { -#ifdef CONFIG_VIRTUAL_MEM_MAP - if (max_gap < LARGE_GAP) - continue; -#endif - i = vmemmap_find_next_valid_pfn(nid, i) - 1; - continue; - } - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page)-1; - } - pgdat_resize_unlock(pgdat, &flags); - total_present += present; - total_reserved += reserved; - total_cached += cached; - total_shared += shared; - printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, " - "shrd: %10d, swpd: %10d\n", nid, - present, reserved, shared, cached); - } - printk(KERN_INFO "%ld pages of RAM\n", total_present); - printk(KERN_INFO "%d reserved pages\n", total_reserved); - printk(KERN_INFO "%d pages shared\n", total_shared); - printk(KERN_INFO "%d pages swap cached\n", total_cached); - printk(KERN_INFO "Total of %ld pages in page table cache\n", - quicklist_total_size()); - printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); -} - - /* physical address where the bootmem map is located */ unsigned long bootmap_start; diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 2de08f4d9930..878626805369 100644 --- a/arch/ia64/mm/discontig.c +++ 
b/arch/ia64/mm/discontig.c @@ -607,69 +607,6 @@ void *per_cpu_init(void) } #endif /* CONFIG_SMP */ -/** - * show_mem - give short summary of memory stats - * - * Shows a simple page count of reserved and used pages in the system. - * For discontig machines, it does this on a per-pgdat basis. - */ -void show_mem(unsigned int filter) -{ - int i, total_reserved = 0; - int total_shared = 0, total_cached = 0; - unsigned long total_present = 0; - pg_data_t *pgdat; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(filter); - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) - return; - printk(KERN_INFO "Node memory in pages:\n"); - for_each_online_pgdat(pgdat) { - unsigned long present; - unsigned long flags; - int shared = 0, cached = 0, reserved = 0; - int nid = pgdat->node_id; - - if (skip_free_areas_node(filter, nid)) - continue; - pgdat_resize_lock(pgdat, &flags); - present = pgdat->node_present_pages; - for(i = 0; i < pgdat->node_spanned_pages; i++) { - struct page *page; - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) - touch_nmi_watchdog(); - if (pfn_valid(pgdat->node_start_pfn + i)) - page = pfn_to_page(pgdat->node_start_pfn + i); - else { - i = vmemmap_find_next_valid_pfn(nid, i) - 1; - continue; - } - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page)-1; - } - pgdat_resize_unlock(pgdat, &flags); - total_present += present; - total_reserved += reserved; - total_cached += cached; - total_shared += shared; - printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, " - "shrd: %10d, swpd: %10d\n", nid, - present, reserved, shared, cached); - } - printk(KERN_INFO "%ld pages of RAM\n", total_present); - printk(KERN_INFO "%d reserved pages\n", total_reserved); - printk(KERN_INFO "%d pages shared\n", total_shared); - printk(KERN_INFO "%d pages swap cached\n", total_cached); - printk(KERN_INFO "Total of %ld pages in page table cache\n", - quicklist_total_size()); - printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); -} - /** * call_pernode_memory - use SRAT to call callback functions with node info * @start: physical start of range diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 88504abf5704..25c350264a41 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -684,3 +684,51 @@ per_linux32_init(void) } __initcall(per_linux32_init); + +/** + * show_mem - give short summary of memory stats + * + * Shows a simple page count of reserved and used pages in the system. + * For discontig machines, it does this on a per-pgdat basis. 
+ */ +void show_mem(unsigned int filter) +{ + int total_reserved = 0; + unsigned long total_present = 0; + pg_data_t *pgdat; + + printk(KERN_INFO "Mem-info:\n"); + show_free_areas(filter); + printk(KERN_INFO "Node memory in pages:\n"); + for_each_online_pgdat(pgdat) { + unsigned long present; + unsigned long flags; + int reserved = 0; + int nid = pgdat->node_id; + int zoneid; + + if (skip_free_areas_node(filter, nid)) + continue; + pgdat_resize_lock(pgdat, &flags); + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + reserved += zone->present_pages - zone->managed_pages; + } + present = pgdat->node_present_pages; + + pgdat_resize_unlock(pgdat, &flags); + total_present += present; + total_reserved += reserved; + printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, ", + nid, present, reserved); + } + printk(KERN_INFO "%ld pages of RAM\n", total_present); + printk(KERN_INFO "%d reserved pages\n", total_reserved); + printk(KERN_INFO "Total of %ld pages in page table cache\n", + quicklist_total_size()); + printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); +} diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 96f8168cf4ec..ae085ad0fba0 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -645,55 +645,30 @@ EXPORT_SYMBOL(empty_zero_page); void show_mem(unsigned int filter) { - int i,free = 0,total = 0,reserved = 0; - int shared = 0, cached = 0; + int total = 0,reserved = 0; + pg_data_t *pgdat; printk(KERN_INFO "Mem-info:\n"); show_free_areas(filter); - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) - return; -#ifndef CONFIG_DISCONTIGMEM - i = max_mapnr; - while (i-- > 0) { - total++; - if (PageReserved(mem_map+i)) - reserved++; - else if (PageSwapCache(mem_map+i)) - cached++; - else if (!page_count(&mem_map[i])) - free++; - else - shared += page_count(&mem_map[i]) - 1; - } -#else - for (i = 0; i < npmem_ranges; i++) { - int j; - for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { - struct page *p; - unsigned long flags; - - pgdat_resize_lock(NODE_DATA(i), &flags); - p = nid_page_nr(i, j) - node_start_pfn(i); - - total++; - if (PageReserved(p)) - reserved++; - else if (PageSwapCache(p)) - cached++; - else if (!page_count(p)) - free++; - else - shared += page_count(p) - 1; - pgdat_resize_unlock(NODE_DATA(i), &flags); - } + for_each_online_pgdat(pgdat) { + unsigned long flags; + int zoneid; + + pgdat_resize_lock(pgdat, &flags); + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + total += zone->present_pages; + reserved = zone->present_pages - zone->managed_pages; + } + pgdat_resize_unlock(pgdat, &flags); } -#endif + printk(KERN_INFO "%d pages of RAM\n", total); printk(KERN_INFO "%d reserved pages\n", reserved); - printk(KERN_INFO "%d pages shared\n", shared); - printk(KERN_INFO "%d pages swap cached\n", cached); - #ifdef CONFIG_DISCONTIGMEM { diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index ae6bc036db92..be2bde9b07cf 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -66,9 +66,6 @@ void show_mem(unsigned int filter) printk(KERN_DEFAULT "Mem-info:\n"); show_free_areas(filter); - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) - return; - for_each_bank(i, mi) { struct membank *bank = &mi->bank[i]; unsigned int pfn1, pfn2; diff --git a/include/linux/mm.h b/include/linux/mm.h index fc4415256ec3..4c0c01afc19b 100644 --- 
a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1016,7 +1016,6 @@ extern void pagefault_out_of_memory(void); * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ -#define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */ extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); diff --git a/lib/show_mem.c b/lib/show_mem.c index 5847a4921b8e..f58689f5a24e 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -17,9 +17,6 @@ void show_mem(unsigned int filter) printk("Mem-Info:\n"); show_free_areas(filter); - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) - return; - for_each_online_pgdat(pgdat) { unsigned long flags; int zoneid; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89d81f4429ca..ec4417cb458a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2071,13 +2071,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) debug_guardpage_minorder() > 0) return; - /* - * Walking all memory to count page types is very expensive and should - * be inhibited in non-blockable contexts. - */ - if (!(gfp_mask & __GFP_WAIT)) - filter |= SHOW_MEM_FILTER_PAGE_COUNT; - /* * This documents exceptions given to allocations in certain * contexts that are allowed to allocate outside current's set -- cgit 1.4.1 From 49f0ce5f92321cdcf741e35f385669a421013cb7 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Tue, 21 Jan 2014 15:49:14 -0800 Subject: mm: add overcommit_kbytes sysctl variable Some applications that run on HPC clusters are designed around the availability of RAM and the overcommit ratio is fine tuned to get the maximum usage of memory without swapping. With growing memory, the 1%-of-all-RAM grain provided by overcommit_ratio has become too coarse for these workload (on a 2TB machine it represents no less than 20GB). This patch adds the new overcommit_kbytes sysctl variable that allow a much finer grain. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Jerome Marchand Cc: Dave Hansen Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 12 ++++++++++++ Documentation/vm/overcommit-accounting | 7 ++++--- include/linux/mm.h | 9 +++++++++ include/linux/mman.h | 1 + kernel/sysctl.c | 11 ++++++++--- mm/mmap.c | 1 + mm/nommu.c | 1 + mm/util.c | 36 ++++++++++++++++++++++++++++++++-- 8 files changed, 70 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 1fbd4eb7b64a..9f5481bdc5a4 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -47,6 +47,7 @@ Currently, these files are in /proc/sys/vm: - numa_zonelist_order - oom_dump_tasks - oom_kill_allocating_task +- overcommit_kbytes - overcommit_memory - overcommit_ratio - page-cluster @@ -574,6 +575,17 @@ The default value is 0. ============================================================== +overcommit_kbytes: + +When overcommit_memory is set to 2, the committed address space is not +permitted to exceed swap plus this amount of physical RAM. See below. + +Note: overcommit_kbytes is the counterpart of overcommit_ratio. Only one +of them may be specified at a time. Setting one disables the other (which +then appears as 0 when read). + +============================================================== + overcommit_memory: This value contains a flag that enables memory overcommitment. 
diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting index 8eaa2fc4b8fa..cbfaaa674118 100644 --- a/Documentation/vm/overcommit-accounting +++ b/Documentation/vm/overcommit-accounting @@ -14,8 +14,8 @@ The Linux kernel supports the following overcommit handling modes 2 - Don't overcommit. The total address space commit for the system is not permitted to exceed swap + a - configurable percentage (default is 50) of physical RAM. - Depending on the percentage you use, in most situations + configurable amount (default is 50%) of physical RAM. + Depending on the amount you use, in most situations this means a process will not be killed while accessing pages but will receive errors on memory allocation as appropriate. @@ -26,7 +26,8 @@ The Linux kernel supports the following overcommit handling modes The overcommit policy is set via the sysctl `vm.overcommit_memory'. -The overcommit percentage is set via `vm.overcommit_ratio'. +The overcommit amount can be set via `vm.overcommit_ratio' (percentage) +or `vm.overcommit_kbytes' (absolute value). The current overcommit limit and amount committed are viewable in /proc/meminfo as CommitLimit and Committed_AS respectively. diff --git a/include/linux/mm.h b/include/linux/mm.h index 4c0c01afc19b..a512dd836931 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -57,6 +57,15 @@ extern int sysctl_legacy_va_layout; extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; +extern int sysctl_overcommit_memory; +extern int sysctl_overcommit_ratio; +extern unsigned long sysctl_overcommit_kbytes; + +extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *, + size_t *, loff_t *); +extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, + size_t *, loff_t *); + #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) /* to align the pointer to the (next) page boundary */ diff --git a/include/linux/mman.h b/include/linux/mman.h index 7f7f8dae4b1d..16373c8f5f57 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -9,6 +9,7 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; +extern unsigned long sysctl_overcommit_kbytes; extern struct percpu_counter vm_committed_as; #ifdef CONFIG_SMP diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8da99f905cf..332cefcdb04b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,8 +95,6 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. 
*/ -extern int sysctl_overcommit_memory; -extern int sysctl_overcommit_ratio; extern int max_threads; extern int suid_dumpable; #ifdef CONFIG_COREDUMP @@ -1121,7 +1119,14 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = overcommit_ratio_handler, + }, + { + .procname = "overcommit_kbytes", + .data = &sysctl_overcommit_kbytes, + .maxlen = sizeof(sysctl_overcommit_kbytes), + .mode = 0644, + .proc_handler = overcommit_kbytes_handler, }, { .procname = "page-cluster", diff --git a/mm/mmap.c b/mm/mmap.c index 834b2d785f1e..39552de6e1db 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ diff --git a/mm/nommu.c b/mm/nommu.c index fec093adad9c..8740213b1647 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ diff --git a/mm/util.c b/mm/util.c index 808f375648e7..a24aa22f2473 100644 --- a/mm/util.c +++ b/mm/util.c @@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page) return mapping; } +int overcommit_ratio_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_kbytes = 0; + return ret; +} + +int overcommit_kbytes_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_ratio = 0; + return ret; +} + /* * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used */ unsigned long vm_commit_limit(void) { - return ((totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100) + total_swap_pages; + unsigned long allowed; + + if (sysctl_overcommit_kbytes) + allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); + else + allowed = ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100); + allowed += total_swap_pages; + + return allowed; } -- cgit 1.4.1 From 66a20757214d94b915f2d2aada1384dead9ab18d Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 21 Jan 2014 15:49:20 -0800 Subject: memblock, numa: introduce flags field into memblock There is no flag in memblock to describe what type the memory is. Sometimes, we may use memblock to reserve some memory for special usage. And we want to know what kind of memory it is. So we need a way to In hotplug environment, we want to reserve hotpluggable memory so the kernel won't be able to use it. 
And when the system is up, we have to free these hotpluggable memory to buddy. So we need to mark these memory first. In order to do so, we need to mark out these special memory in memblock. In this patch, we introduce a new "flags" member into memblock_region: struct memblock_region { phys_addr_t base; phys_addr_t size; unsigned long flags; /* This is new. */ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int nid; #endif }; This patch does the following things: 1) Add "flags" member to memblock_region. 2) Modify the following APIs' prototype: memblock_add_region() memblock_insert_region() 3) Add memblock_reserve_region() to support reserve memory with flags, and keep memblock_reserve()'s prototype unmodified. 4) Modify other APIs to support flags, but keep their prototype unmodified. The idea is from Wen Congyang and Liu Jiang . Suggested-by: Wen Congyang Suggested-by: Liu Jiang Signed-off-by: Tang Chen Reviewed-by: Zhang Yanfei Cc: "H. Peter Anvin" Cc: "Rafael J . Wysocki" Cc: Chen Tang Cc: Gong Chen Cc: Ingo Molnar Cc: Jiang Liu Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Larry Woodman Cc: Len Brown Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Prarit Bhargava Cc: Rik van Riel Cc: Taku Izumi Cc: Tejun Heo Cc: Thomas Gleixner Cc: Thomas Renninger Cc: Toshi Kani Cc: Vasilis Liaskovitis Cc: Wanpeng Li Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 1 + mm/memblock.c | 53 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 39 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 77c60e52939d..9a805ec6e794 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -22,6 +22,7 @@ struct memblock_region { phys_addr_t base; phys_addr_t size; + unsigned long flags; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int nid; #endif diff --git a/mm/memblock.c b/mm/memblock.c index aab566998b61..270b005ca964 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -255,6 +255,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u type->cnt = 1; type->regions[0].base = 0; type->regions[0].size = 0; + type->regions[0].flags = 0; memblock_set_region_node(&type->regions[0], MAX_NUMNODES); } } @@ -405,7 +406,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) if (this->base + this->size != next->base || memblock_get_region_node(this) != - memblock_get_region_node(next)) { + memblock_get_region_node(next) || + this->flags != next->flags) { BUG_ON(this->base + this->size > next->base); i++; continue; @@ -425,13 +427,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) * @base: base address of the new region * @size: size of the new region * @nid: node id of the new region + * @flags: flags of the new region * * Insert new memblock region [@base,@base+@size) into @type at @idx. * @type must already have extra room to accomodate the new region. 
*/ static void __init_memblock memblock_insert_region(struct memblock_type *type, int idx, phys_addr_t base, - phys_addr_t size, int nid) + phys_addr_t size, + int nid, unsigned long flags) { struct memblock_region *rgn = &type->regions[idx]; @@ -439,6 +443,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); rgn->base = base; rgn->size = size; + rgn->flags = flags; memblock_set_region_node(rgn, nid); type->cnt++; type->total_size += size; @@ -450,6 +455,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * @base: base address of the new region * @size: size of the new region * @nid: nid of the new region + * @flags: flags of the new region * * Add new memblock region [@base,@base+@size) into @type. The new region * is allowed to overlap with existing ones - overlaps don't affect already @@ -460,7 +466,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * 0 on success, -errno on failure. */ static int __init_memblock memblock_add_region(struct memblock_type *type, - phys_addr_t base, phys_addr_t size, int nid) + phys_addr_t base, phys_addr_t size, + int nid, unsigned long flags) { bool insert = false; phys_addr_t obase = base; @@ -475,6 +482,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type, WARN_ON(type->cnt != 1 || type->total_size); type->regions[0].base = base; type->regions[0].size = size; + type->regions[0].flags = flags; memblock_set_region_node(&type->regions[0], nid); type->total_size = size; return 0; @@ -505,7 +513,8 @@ repeat: nr_new++; if (insert) memblock_insert_region(type, i++, base, - rbase - base, nid); + rbase - base, nid, + flags); } /* area below @rend is dealt with, forget about it */ base = min(rend, end); @@ -515,7 +524,8 @@ repeat: if (base < end) { nr_new++; if (insert) - memblock_insert_region(type, i, base, end - base, nid); + memblock_insert_region(type, i, base, end - base, + nid, flags); } /* @@ -537,12 +547,13 @@ repeat: int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int nid) { - return memblock_add_region(&memblock.memory, base, size, nid); + return memblock_add_region(&memblock.memory, base, size, nid, 0); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); + return memblock_add_region(&memblock.memory, base, size, + MAX_NUMNODES, 0); } /** @@ -597,7 +608,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->size -= base - rbase; type->total_size -= base - rbase; memblock_insert_region(type, i, rbase, base - rbase, - memblock_get_region_node(rgn)); + memblock_get_region_node(rgn), + rgn->flags); } else if (rend > end) { /* * @rgn intersects from above. 
Split and redo the @@ -607,7 +619,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->size -= end - rbase; type->total_size -= end - rbase; memblock_insert_region(type, i--, rbase, end - rbase, - memblock_get_region_node(rgn)); + memblock_get_region_node(rgn), + rgn->flags); } else { /* @rgn is fully contained, record it */ if (!*end_rgn) @@ -649,16 +662,24 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) return __memblock_remove(&memblock.reserved, base, size); } -int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +static int __init_memblock memblock_reserve_region(phys_addr_t base, + phys_addr_t size, + int nid, + unsigned long flags) { struct memblock_type *_rgn = &memblock.reserved; - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", + memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, - (void *)_RET_IP_); + flags, (void *)_RET_IP_); + + return memblock_add_region(_rgn, base, size, nid, flags); +} - return memblock_add_region(_rgn, base, size, MAX_NUMNODES); +int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +{ + return memblock_reserve_region(base, size, MAX_NUMNODES, 0); } /** @@ -1101,6 +1122,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) static void __init_memblock memblock_dump(struct memblock_type *type, char *name) { unsigned long long base, size; + unsigned long flags; int i; pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); @@ -1111,13 +1133,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name base = rgn->base; size = rgn->size; + flags = rgn->flags; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif - pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", - name, i, base, base + size - 1, size, nid_buf); + pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", + name, i, base, base + size - 1, size, nid_buf, flags); } } -- cgit 1.4.1 From 66b16edf9eafc3291cabb2253d0f342a847656b7 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 21 Jan 2014 15:49:23 -0800 Subject: memblock, mem_hotplug: introduce MEMBLOCK_HOTPLUG flag to mark hotpluggable regions In find_hotpluggable_memory, once we find out a memory region which is hotpluggable, we want to mark it in memblock.memory, so that we can tell the memblock allocator not to allocate hotpluggable memory for the kernel later. To achieve this goal, we introduce the MEMBLOCK_HOTPLUG flag to indicate the hotpluggable memory regions in memblock and a function memblock_mark_hotplug() to mark hotpluggable memory if we find one. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Tang Chen Reviewed-by: Zhang Yanfei Cc: "H. Peter Anvin" Cc: "Rafael J .
Wysocki" Cc: Chen Tang Cc: Gong Chen Cc: Ingo Molnar Cc: Jiang Liu Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Larry Woodman Cc: Len Brown Cc: Liu Jiang Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Prarit Bhargava Cc: Rik van Riel Cc: Taku Izumi Cc: Tejun Heo Cc: Thomas Gleixner Cc: Thomas Renninger Cc: Toshi Kani Cc: Vasilis Liaskovitis Cc: Wanpeng Li Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 17 ++++++++++++++++ mm/memblock.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 9a805ec6e794..b788faa71563 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -19,6 +19,9 @@ #define INIT_MEMBLOCK_REGIONS 128 +/* Definition of memblock flags. */ +#define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */ + struct memblock_region { phys_addr_t base; phys_addr_t size; @@ -60,6 +63,8 @@ int memblock_remove(phys_addr_t base, phys_addr_t size); int memblock_free(phys_addr_t base, phys_addr_t size); int memblock_reserve(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); +int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); +int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, @@ -122,6 +127,18 @@ void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, i != (u64)ULLONG_MAX; \ __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid)) +static inline void memblock_set_region_flags(struct memblock_region *r, + unsigned long flags) +{ + r->flags |= flags; +} + +static inline void memblock_clear_region_flags(struct memblock_region *r, + unsigned long flags) +{ + r->flags &= ~flags; +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid); diff --git a/mm/memblock.c b/mm/memblock.c index 270b005ca964..2121ec4c7fa0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -682,6 +682,59 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) return memblock_reserve_region(base, size, MAX_NUMNODES, 0); } +/** + * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. + * @base: the base phys addr of the region + * @size: the size of the region + * + * This function isolates region [@base, @base + @size), and mark it with flag + * MEMBLOCK_HOTPLUG. + * + * Return 0 on succees, -errno on failure. + */ +int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) +{ + struct memblock_type *type = &memblock.memory; + int i, ret, start_rgn, end_rgn; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = start_rgn; i < end_rgn; i++) + memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); + + memblock_merge_regions(type); + return 0; +} + +/** + * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * This function isolates region [@base, @base + @size), and clear flag + * MEMBLOCK_HOTPLUG for the isolated regions. + * + * Return 0 on succees, -errno on failure. 
+ */ +int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) +{ + struct memblock_type *type = &memblock.memory; + int i, ret, start_rgn, end_rgn; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = start_rgn; i < end_rgn; i++) + memblock_clear_region_flags(&type->regions[i], + MEMBLOCK_HOTPLUG); + + memblock_merge_regions(type); + return 0; +} + /** * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable -- cgit 1.4.1 From e7e8de5918dd6a07cbddae559600d7765ad6a56e Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 21 Jan 2014 15:49:26 -0800 Subject: memblock: make memblock_set_node() support different memblock_type [sfr@canb.auug.org.au: fix powerpc build] Signed-off-by: Tang Chen Reviewed-by: Zhang Yanfei Cc: "H. Peter Anvin" Cc: "Rafael J . Wysocki" Cc: Chen Tang Cc: Gong Chen Cc: Ingo Molnar Cc: Jiang Liu Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Larry Woodman Cc: Len Brown Cc: Liu Jiang Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Prarit Bhargava Cc: Rik van Riel Cc: Taku Izumi Cc: Tejun Heo Cc: Thomas Gleixner Cc: Thomas Renninger Cc: Toshi Kani Cc: Vasilis Liaskovitis Cc: Wanpeng Li Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/metag/mm/init.c | 3 ++- arch/metag/mm/numa.c | 3 ++- arch/microblaze/mm/init.c | 3 ++- arch/powerpc/mm/mem.c | 2 +- arch/powerpc/mm/numa.c | 8 +++++--- arch/sh/kernel/setup.c | 4 ++-- arch/sparc/mm/init_64.c | 5 +++-- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/numa.c | 6 ++++-- include/linux/memblock.h | 3 ++- mm/memblock.c | 6 +++--- 12 files changed, 28 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c index 3cd6288f65c2..11fa51c89617 100644 --- a/arch/metag/mm/init.c +++ b/arch/metag/mm/init.c @@ -204,7 +204,8 @@ static void __init do_init_bootmem(void) start_pfn = memblock_region_memory_base_pfn(reg); end_pfn = memblock_region_memory_end_pfn(reg); memblock_set_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), 0); + PFN_PHYS(end_pfn - start_pfn), + &memblock.memory, 0); } /* All of system RAM sits in node 0 for the non-NUMA case */ diff --git a/arch/metag/mm/numa.c b/arch/metag/mm/numa.c index b172aa45fcf8..67b46c295072 100644 --- a/arch/metag/mm/numa.c +++ b/arch/metag/mm/numa.c @@ -42,7 +42,8 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end) memblock_add(start, end - start); memblock_set_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), nid); + PFN_PHYS(end_pfn - start_pfn), + &memblock.memory, nid); /* Node-local pgdat */ pgdat_paddr = memblock_alloc_base(sizeof(struct pglist_data), diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 74c7bcc1e82d..89077d346714 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -192,7 +192,8 @@ void __init setup_memory(void) start_pfn = memblock_region_memory_base_pfn(reg); end_pfn = memblock_region_memory_end_pfn(reg); memblock_set_node(start_pfn << PAGE_SHIFT, - (end_pfn - start_pfn) << PAGE_SHIFT, 0); + (end_pfn - start_pfn) << PAGE_SHIFT, + &memblock.memory, 0); } /* free bootmem is whole main memory */ diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 3fa93dc7fe75..8c1dd23652a1 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -209,7 +209,7 @@ void __init 
do_init_bootmem(void) /* Place all memblock_regions in the same node and merge contiguous * memblock_regions */ - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); /* Add all physical memory to the bootmem map, mark each area * present. diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 078d3e00a616..5a944f25e94f 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -670,7 +670,8 @@ static void __init parse_drconf_memory(struct device_node *memory) node_set_online(nid); sz = numa_enforce_memory_limit(base, size); if (sz) - memblock_set_node(base, sz, nid); + memblock_set_node(base, sz, + &memblock.memory, nid); } while (--ranges); } } @@ -760,7 +761,7 @@ new_range: continue; } - memblock_set_node(start, size, nid); + memblock_set_node(start, size, &memblock.memory, nid); if (--ranges) goto new_range; @@ -797,7 +798,8 @@ static void __init setup_nonnuma(void) fake_numa_create_new_node(end_pfn, &nid); memblock_set_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), nid); + PFN_PHYS(end_pfn - start_pfn), + &memblock.memory, nid); node_set_online(nid); } } diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index 1cf90e947dbf..de19cfa768f2 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -230,8 +230,8 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn, pmb_bolt_mapping((unsigned long)__va(start), start, end - start, PAGE_KERNEL); - memblock_set_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), nid); + memblock_set_node(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), + &memblock.memory, nid); } void __init __weak plat_early_device_setup(void) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 5322e530d09c..eafbc65c9c47 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1021,7 +1021,8 @@ static void __init add_node_ranges(void) "start[%lx] end[%lx]\n", nid, start, this_end); - memblock_set_node(start, this_end - start, nid); + memblock_set_node(start, this_end - start, + &memblock.memory, nid); start = this_end; } } @@ -1325,7 +1326,7 @@ static void __init bootmem_init_nonnuma(void) (top_of_ram - total_ram) >> 20); init_node_masks_nonnuma(); - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); allocate_node_data(0); node_set_online(0); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5bdc5430597c..e39504878aec 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -665,7 +665,7 @@ void __init initmem_init(void) high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); sparse_memory_present_with_active_regions(0); #ifdef CONFIG_FLATMEM diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 104d56a9245f..f35c66c5959a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -643,7 +643,7 @@ kernel_physical_mapping_init(unsigned long start, #ifndef CONFIG_NUMA void __init initmem_init(void) { - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); } #endif diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index c85da7bb6b60..82e079a0d363 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -491,7 +491,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) 
for (i = 0; i < mi->nr_blks; i++) { struct numa_memblk *mb = &mi->blk[i]; - memblock_set_node(mb->start, mb->end - mb->start, mb->nid); + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.memory, mb->nid); } /* @@ -565,7 +566,8 @@ static int __init numa_init(int (*init_func)(void)) nodes_clear(node_possible_map); nodes_clear(node_online_map); memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); + WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, + MAX_NUMNODES)); numa_reset_distance(); ret = init_func(); diff --git a/include/linux/memblock.h b/include/linux/memblock.h index b788faa71563..97480d392e40 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -140,7 +140,8 @@ static inline void memblock_clear_region_flags(struct memblock_region *r, } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid); +int memblock_set_node(phys_addr_t base, phys_addr_t size, + struct memblock_type *type, int nid); static inline void memblock_set_region_node(struct memblock_region *r, int nid) { diff --git a/mm/memblock.c b/mm/memblock.c index 2121ec4c7fa0..d5681008dce1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -911,18 +911,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for * @size: size of area to set node ID for + * @type: memblock type to set node ID for * @nid: node ID to set * - * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. + * Set the nid of memblock @type regions in [@base,@base+@size) to @nid. * Regions which cross the area boundaries are split as necessary. * * RETURNS: * 0 on success, -errno on failure. */ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, - int nid) + struct memblock_type *type, int nid) { - struct memblock_type *type = &memblock.memory; int start_rgn, end_rgn; int i, ret; -- cgit 1.4.1 From 55ac590c2fadad785d60dd70c12d62823bc2cd39 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 21 Jan 2014 15:49:35 -0800 Subject: memblock, mem_hotplug: make memblock skip hotpluggable regions if needed Linux kernel cannot migrate pages used by the kernel. As a result, hotpluggable memory used by the kernel won't be able to be hot-removed. To solve this problem, the basic idea is to prevent memblock from allocating hotpluggable memory for the kernel at early time, and arrange all hotpluggable memory in ACPI SRAT(System Resource Affinity Table) as ZONE_MOVABLE when initializing zones. In the previous patches, we have marked hotpluggable memory regions with MEMBLOCK_HOTPLUG flag in memblock.memory. In this patch, we make memblock skip these hotpluggable memory regions in the default top-down allocation function if movable_node boot option is specified. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Tang Chen Signed-off-by: Zhang Yanfei Cc: "H. Peter Anvin" Cc: "Rafael J . 
Wysocki" Cc: Chen Tang Cc: Gong Chen Cc: Ingo Molnar Cc: Jiang Liu Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Larry Woodman Cc: Len Brown Cc: Liu Jiang Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Prarit Bhargava Cc: Rik van Riel Cc: Taku Izumi Cc: Tejun Heo Cc: Thomas Gleixner Cc: Thomas Renninger Cc: Toshi Kani Cc: Vasilis Liaskovitis Cc: Wanpeng Li Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 24 ++++++++++++++++++++++++ mm/memblock.c | 12 ++++++++++++ mm/memory_hotplug.c | 1 + 3 files changed, 37 insertions(+) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 97480d392e40..2f52c8c492bd 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -47,6 +47,10 @@ struct memblock { extern struct memblock memblock; extern int memblock_debug; +#ifdef CONFIG_MOVABLE_NODE +/* If movable_node boot option specified */ +extern bool movable_node_enabled; +#endif /* CONFIG_MOVABLE_NODE */ #define memblock_dbg(fmt, ...) \ if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) @@ -65,6 +69,26 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); +#ifdef CONFIG_MOVABLE_NODE +static inline bool memblock_is_hotpluggable(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_HOTPLUG; +} + +static inline bool movable_node_is_enabled(void) +{ + return movable_node_enabled; +} +#else +static inline bool memblock_is_hotpluggable(struct memblock_region *m) +{ + return false; +} +static inline bool movable_node_is_enabled(void) +{ + return false; +} +#endif #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, diff --git a/mm/memblock.c b/mm/memblock.c index d5681008dce1..6a2a48a122a9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -39,6 +39,9 @@ struct memblock memblock __initdata_memblock = { }; int memblock_debug __initdata_memblock; +#ifdef CONFIG_MOVABLE_NODE +bool movable_node_enabled __initdata_memblock = false; +#endif static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; @@ -820,6 +823,11 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, * @out_nid: ptr to int for nid of the range, can be %NULL * * Reverse of __next_free_mem_range(). + * + * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't + * be able to hot-remove hotpluggable memory used by the kernel. So this + * function skip hotpluggable regions if needed when allocating memory for the + * kernel. 
*/ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, @@ -844,6 +852,10 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) continue; + /* skip hotpluggable memory regions if needed */ + if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) + continue; + /* scan areas before each reservation for intersection */ for ( ; ri >= 0; ri--) { struct memblock_region *r = &rsv->regions[ri]; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 489f235502db..01e39afde1cb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1446,6 +1446,7 @@ static int __init cmdline_parse_movable_node(char *p) * the kernel away from hotpluggable memory. */ memblock_set_bottom_up(true); + movable_node_enabled = true; #else pr_warn("movable_node option not supported\n"); #endif -- cgit 1.4.1 From 051ac83adf69eea4f57a97356e4282e395a5fa6d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:48 -0800 Subject: mm/rmap: make rmap_walk to get the rmap_walk_control argument In each rmap traverse case, there is some difference, so we need function pointers and arguments to them in order to handle these differences. For this purpose, struct rmap_walk_control is introduced in this patch, and will be extended in a following patch. Introducing and extending are separate, because it clarifies the changes. Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 7 +++---- include/linux/rmap.h | 9 +++++++-- mm/ksm.c | 6 +++--- mm/migrate.c | 7 ++++++- mm/rmap.c | 19 ++++++++----------- 5 files changed, 27 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 45c9b6a17bcb..0eef8cb0baf7 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -76,8 +76,7 @@ struct page *ksm_might_need_to_copy(struct page *page, int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, unsigned long *vm_flags); int try_to_unmap_ksm(struct page *page, enum ttu_flags flags); -int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg); +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); #else /* !CONFIG_KSM */ @@ -120,8 +119,8 @@ static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) return 0; } -static inline int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page*, - struct vm_area_struct *, unsigned long, void *), void *arg) +static inline int rmap_walk_ksm(struct page *page, + struct rmap_walk_control *rwc) { return 0; } diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6dacb93a6d94..6a456ce6de20 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -235,11 +235,16 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page); void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); +struct rmap_walk_control { + void *arg; + int (*rmap_one)(struct page *page, struct vm_area_struct *vma, + unsigned long addr, void *arg); +}; + /* * Called by migrate.c to remove migration ptes, but might be used more later.
*/ -int rmap_walk(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg); +int rmap_walk(struct page *page, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ diff --git a/mm/ksm.c b/mm/ksm.c index 175fff79dc95..c3035fee8080 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1997,8 +1997,7 @@ out: } #ifdef CONFIG_MIGRATION -int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; @@ -2033,7 +2032,8 @@ again: if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; - ret = rmap_one(page, vma, rmap_item->address, arg); + ret = rwc->rmap_one(page, vma, + rmap_item->address, rwc->arg); if (ret != SWAP_AGAIN) { anon_vma_unlock_read(anon_vma); goto out; diff --git a/mm/migrate.c b/mm/migrate.c index 9194375b2307..11d89dc0574c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -199,7 +199,12 @@ out: */ static void remove_migration_ptes(struct page *old, struct page *new) { - rmap_walk(new, remove_migration_pte, old); + struct rmap_walk_control rwc = { + .rmap_one = remove_migration_pte, + .arg = old, + }; + + rmap_walk(new, &rwc); } /* diff --git a/mm/rmap.c b/mm/rmap.c index 5a79bf585e27..f8f10ad5d359 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1706,8 +1706,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page) * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): * Called by migrate.c to remove migration ptes, but might be used more later. */ -static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -1721,7 +1720,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - ret = rmap_one(page, vma, address, arg); + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) break; } @@ -1729,8 +1728,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, return ret; } -static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << compound_order(page); @@ -1742,7 +1740,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - ret = rmap_one(page, vma, address, arg); + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) break; } @@ -1755,17 +1753,16 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, return ret; } -int rmap_walk(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +int rmap_walk(struct page *page, struct rmap_walk_control *rwc) { VM_BUG_ON(!PageLocked(page)); if (unlikely(PageKsm(page))) - return rmap_walk_ksm(page, rmap_one, 
arg); + return rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rmap_one, arg); + return rmap_walk_anon(page, rwc); else - return rmap_walk_file(page, rmap_one, arg); + return rmap_walk_file(page, rwc); } #endif /* CONFIG_MIGRATION */ -- cgit 1.4.1 From 0dd1c7bbce8d1d142bb25aefaa50262dfd77cb78 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:49 -0800 Subject: mm/rmap: extend rmap_walk_xxx() to cope with different cases There are a lot of common parts in traversing functions, but there are also a little of uncommon parts in it. By assigning proper function pointer on each rmap_walker_control, we can handle these difference correctly. Following are differences we should handle. 1. difference of lock function in anon mapping case 2. nonlinear handling in file mapping case 3. prechecked condition: checking memcg in page_referenced(), checking VM_SHARE in page_mkclean() checking temporary vma in try_to_unmap() 4. exit condition: checking page_mapped() in try_to_unmap() So, in this patch, I introduce 4 function pointers to handle above differences. Signed-off-by: Joonsoo Kim Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 15 +++++++++++++++ mm/ksm.c | 7 +++++++ mm/rmap.c | 37 +++++++++++++++++++++++++++++-------- 3 files changed, 51 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6a456ce6de20..616aa4d05f0a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -235,10 +235,25 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page); void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); +/* + * rmap_walk_control: To control rmap traversing for specific needs + * + * arg: passed to rmap_one() and invalid_vma() + * rmap_one: executed on each vma where page is mapped + * done: for checking traversing termination condition + * file_nonlinear: for handling file nonlinear mapping + * anon_lock: for getting anon_lock by optimized way rather than default + * invalid_vma: for skipping uninterested vma + */ struct rmap_walk_control { void *arg; int (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); + int (*done)(struct page *page); + int (*file_nonlinear)(struct page *, struct address_space *, + struct vm_area_struct *vma); + struct anon_vma *(*anon_lock)(struct page *page); + bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; /* diff --git a/mm/ksm.c b/mm/ksm.c index c3035fee8080..91b8cb35f7cc 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2032,12 +2032,19 @@ again: if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + ret = rwc->rmap_one(page, vma, rmap_item->address, rwc->arg); if (ret != SWAP_AGAIN) { anon_vma_unlock_read(anon_vma); goto out; } + if (rwc->done && rwc->done(page)) { + anon_vma_unlock_read(anon_vma); + goto out; + } } anon_vma_unlock_read(anon_vma); } diff --git a/mm/rmap.c b/mm/rmap.c index f8f10ad5d359..97bf8f0396f8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1684,10 +1684,14 @@ void __put_anon_vma(struct anon_vma *anon_vma) } #ifdef CONFIG_MIGRATION -static struct anon_vma *rmap_walk_anon_lock(struct page *page) +static struct anon_vma *rmap_walk_anon_lock(struct page *page, + struct rmap_walk_control *rwc) { 
struct anon_vma *anon_vma; + if (rwc->anon_lock) + return rwc->anon_lock(page); + /* * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages @@ -1713,16 +1717,22 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - anon_vma = rmap_walk_anon_lock(page); + anon_vma = rmap_walk_anon_lock(page, rwc); if (!anon_vma) return ret; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) break; + if (rwc->done && rwc->done(page)) + break; } anon_vma_unlock_read(anon_vma); return ret; @@ -1740,15 +1750,26 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) - break; + goto done; + if (rwc->done && rwc->done(page)) + goto done; } - /* - * No nonlinear handling: being always shared, nonlinear vmas - * never contain migration ptes. Decide what to do about this - * limitation to linear when we need rmap_walk() on nonlinear. - */ + + if (!rwc->file_nonlinear) + goto done; + + if (list_empty(&mapping->i_mmap_nonlinear)) + goto done; + + ret = rwc->file_nonlinear(page, mapping, vma); + +done: mutex_unlock(&mapping->i_mmap_mutex); return ret; } -- cgit 1.4.1 From 52629506420ce32997f1fba0a1ab2f1aaa9a4f79 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:50 -0800 Subject: mm/rmap: use rmap_walk() in try_to_unmap() Now, we have an infrastructure in rmap_walk() to handle difference from variants of rmap traversing functions. So, just use it in try_to_unmap(). In this patch, I change following things. 1. enable rmap_walk() if !CONFIG_MIGRATION. 2. mechanical change to use rmap_walk() in try_to_unmap(). Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 5 +---- mm/ksm.c | 4 ++-- mm/rmap.c | 48 ++++++++++++++++++++++++++++++++++++------------ 3 files changed, 39 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 616aa4d05f0a..2462458708cd 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -190,7 +190,7 @@ int page_referenced_one(struct page *, struct vm_area_struct *, int try_to_unmap(struct page *, enum ttu_flags flags); int try_to_unmap_one(struct page *, struct vm_area_struct *, - unsigned long address, enum ttu_flags flags); + unsigned long address, void *arg); /* * Called from mm/filemap_xip.c to unmap empty zero page @@ -256,9 +256,6 @@ struct rmap_walk_control { bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; -/* - * Called by migrate.c to remove migration ptes, but might be used more later. 
- */ int rmap_walk(struct page *page, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ diff --git a/mm/ksm.c b/mm/ksm.c index 91b8cb35f7cc..6b4baa97f4c0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1982,7 +1982,7 @@ again: continue; ret = try_to_unmap_one(page, vma, - rmap_item->address, flags); + rmap_item->address, (void *)flags); if (ret != SWAP_AGAIN || !page_mapped(page)) { anon_vma_unlock_read(anon_vma); goto out; @@ -1996,7 +1996,6 @@ out: return ret; } -#ifdef CONFIG_MIGRATION int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; @@ -2054,6 +2053,7 @@ out: return ret; } +#ifdef CONFIG_MIGRATION void ksm_migrate_page(struct page *newpage, struct page *oldpage) { struct stable_node *stable_node; diff --git a/mm/rmap.c b/mm/rmap.c index 97bf8f0396f8..b3263cb32361 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1179,15 +1179,18 @@ out: /* * Subfunctions of try_to_unmap: try_to_unmap_one called * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. + * + * @arg: enum ttu_flags will be passed to this argument */ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, enum ttu_flags flags) + unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; pte_t *pte; pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; + enum ttu_flags flags = (enum ttu_flags)arg; pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) @@ -1513,6 +1516,11 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma) return false; } +static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) +{ + return is_vma_temporary_stack(vma); +} + /** * try_to_unmap_anon - unmap or unlock anonymous page using the object-based * rmap method @@ -1558,7 +1566,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) continue; address = vma_address(page, vma); - ret = try_to_unmap_one(page, vma, address, flags); + ret = try_to_unmap_one(page, vma, address, (void *)flags); if (ret != SWAP_AGAIN || !page_mapped(page)) break; } @@ -1592,7 +1600,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - ret = try_to_unmap_one(page, vma, address, flags); + ret = try_to_unmap_one(page, vma, address, (void *)flags); if (ret != SWAP_AGAIN || !page_mapped(page)) goto out; } @@ -1614,6 +1622,11 @@ out: return ret; } +static int page_not_mapped(struct page *page) +{ + return !page_mapped(page); +}; + /** * try_to_unmap - try to remove all page table mappings to a page * @page: the page to get unmapped @@ -1631,16 +1644,29 @@ out: int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; + struct rmap_walk_control rwc = { + .rmap_one = try_to_unmap_one, + .arg = (void *)flags, + .done = page_not_mapped, + .file_nonlinear = try_to_unmap_nonlinear, + .anon_lock = page_lock_anon_vma_read, + }; - BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); - if (unlikely(PageKsm(page))) - ret = try_to_unmap_ksm(page, flags); - else if (PageAnon(page)) - ret = try_to_unmap_anon(page, flags); - else - ret = try_to_unmap_file(page, flags); + /* + * During exec, a temporary VMA is setup and later moved. + * The VMA is moved under the anon_vma lock but not the + * page tables leading to a race where migration cannot + * find the migration ptes. 
Rather than increasing the + * locking requirements of exec(), migration skips + * temporary VMAs until after exec() completes. + */ + if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) + rwc.invalid_vma = invalid_migration_vma; + + ret = rmap_walk(page, &rwc); + if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; @@ -1683,7 +1709,6 @@ void __put_anon_vma(struct anon_vma *anon_vma) anon_vma_free(anon_vma); } -#ifdef CONFIG_MIGRATION static struct anon_vma *rmap_walk_anon_lock(struct page *page, struct rmap_walk_control *rwc) { @@ -1785,7 +1810,6 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc) else return rmap_walk_file(page, rwc); } -#endif /* CONFIG_MIGRATION */ #ifdef CONFIG_HUGETLB_PAGE /* -- cgit 1.4.1 From e8351ac9bfa7f4412d5d196b6742309473ca506d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:52 -0800 Subject: mm/rmap: use rmap_walk() in try_to_munlock() Now, we have an infrastructure in rmap_walk() to handle difference from variants of rmap traversing functions. So, just use it in try_to_munlock(). In this patch, I change following things. 1. remove some variants of rmap traversing functions. cf> try_to_unmap_ksm, try_to_unmap_anon, try_to_unmap_file 2. mechanical change to use rmap_walk() in try_to_munlock(). 3. copy and paste comments. Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 6 -- mm/ksm.c | 50 ----------------- mm/rmap.c | 154 ++++++++++++++-------------------------------------- 3 files changed, 42 insertions(+), 168 deletions(-) (limited to 'include') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 0eef8cb0baf7..91b9719722c3 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -75,7 +75,6 @@ struct page *ksm_might_need_to_copy(struct page *page, int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, unsigned long *vm_flags); -int try_to_unmap_ksm(struct page *page, enum ttu_flags flags); int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); @@ -114,11 +113,6 @@ static inline int page_referenced_ksm(struct page *page, return 0; } -static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) -{ - return 0; -} - static inline int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { diff --git a/mm/ksm.c b/mm/ksm.c index 6b4baa97f4c0..646d45a6b6c8 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1946,56 +1946,6 @@ out: return referenced; } -int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) -{ - struct stable_node *stable_node; - struct rmap_item *rmap_item; - int ret = SWAP_AGAIN; - int search_new_forks = 0; - - VM_BUG_ON(!PageKsm(page)); - VM_BUG_ON(!PageLocked(page)); - - stable_node = page_stable_node(page); - if (!stable_node) - return SWAP_FAIL; -again: - hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { - struct anon_vma *anon_vma = rmap_item->anon_vma; - struct anon_vma_chain *vmac; - struct vm_area_struct *vma; - - anon_vma_lock_read(anon_vma); - anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, - 0, ULONG_MAX) { - vma = vmac->vma; - if (rmap_item->address < vma->vm_start || - rmap_item->address >= vma->vm_end) - continue; - /* - * Initially we examine only the vma which covers this - * rmap_item; but later, if there is still work to do, - * we examine 
covering vmas in other mms: in case they - * were forked from the original since ksmd passed. - */ - if ((rmap_item->mm == vma->vm_mm) == search_new_forks) - continue; - - ret = try_to_unmap_one(page, vma, - rmap_item->address, (void *)flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) { - anon_vma_unlock_read(anon_vma); - goto out; - } - } - anon_vma_unlock_read(anon_vma); - } - if (!search_new_forks++) - goto again; -out: - return ret; -} - int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; diff --git a/mm/rmap.c b/mm/rmap.c index b3263cb32361..c73e0c645d09 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1177,9 +1177,6 @@ out: } /* - * Subfunctions of try_to_unmap: try_to_unmap_one called - * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. - * * @arg: enum ttu_flags will be passed to this argument */ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, @@ -1521,107 +1518,6 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return is_vma_temporary_stack(vma); } -/** - * try_to_unmap_anon - unmap or unlock anonymous page using the object-based - * rmap method - * @page: the page to unmap/unlock - * @flags: action and flags - * - * Find all the mappings of a page using the mapping pointer and the vma chains - * contained in the anon_vma struct it points to. - * - * This function is only called from try_to_unmap/try_to_munlock for - * anonymous pages. - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * 'LOCKED. - */ -static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) -{ - struct anon_vma *anon_vma; - pgoff_t pgoff; - struct anon_vma_chain *avc; - int ret = SWAP_AGAIN; - - anon_vma = page_lock_anon_vma_read(page); - if (!anon_vma) - return ret; - - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { - struct vm_area_struct *vma = avc->vma; - unsigned long address; - - /* - * During exec, a temporary VMA is setup and later moved. - * The VMA is moved under the anon_vma lock but not the - * page tables leading to a race where migration cannot - * find the migration ptes. Rather than increasing the - * locking requirements of exec(), migration skips - * temporary VMAs until after exec() completes. - */ - if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && - is_vma_temporary_stack(vma)) - continue; - - address = vma_address(page, vma); - ret = try_to_unmap_one(page, vma, address, (void *)flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) - break; - } - - page_unlock_anon_vma_read(anon_vma); - return ret; -} - -/** - * try_to_unmap_file - unmap/unlock file page using the object-based rmap method - * @page: the page to unmap/unlock - * @flags: action and flags - * - * Find all the mappings of a page using the mapping pointer and the vma chains - * contained in the address_space struct it points to. - * - * This function is only called from try_to_unmap/try_to_munlock for - * object-based pages. - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * 'LOCKED. 
- */ -static int try_to_unmap_file(struct page *page, enum ttu_flags flags) -{ - struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << compound_order(page); - struct vm_area_struct *vma; - int ret = SWAP_AGAIN; - - mutex_lock(&mapping->i_mmap_mutex); - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - unsigned long address = vma_address(page, vma); - ret = try_to_unmap_one(page, vma, address, (void *)flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) - goto out; - } - - if (list_empty(&mapping->i_mmap_nonlinear)) - goto out; - - /* - * We don't bother to try to find the munlocked page in nonlinears. - * It's costly. Instead, later, page reclaim logic may call - * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. - */ - if (TTU_ACTION(flags) == TTU_MUNLOCK) - goto out; - - ret = try_to_unmap_nonlinear(page, mapping, vma); -out: - mutex_unlock(&mapping->i_mmap_mutex); - return ret; -} - static int page_not_mapped(struct page *page) { return !page_mapped(page); @@ -1689,14 +1585,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) */ int try_to_munlock(struct page *page) { + int ret; + struct rmap_walk_control rwc = { + .rmap_one = try_to_unmap_one, + .arg = (void *)TTU_MUNLOCK, + .done = page_not_mapped, + /* + * We don't bother to try to find the munlocked page in + * nonlinears. It's costly. Instead, later, page reclaim logic + * may call try_to_unmap() and recover PG_mlocked lazily. + */ + .file_nonlinear = NULL, + .anon_lock = page_lock_anon_vma_read, + + }; + VM_BUG_ON(!PageLocked(page) || PageLRU(page)); - if (unlikely(PageKsm(page))) - return try_to_unmap_ksm(page, TTU_MUNLOCK); - else if (PageAnon(page)) - return try_to_unmap_anon(page, TTU_MUNLOCK); - else - return try_to_unmap_file(page, TTU_MUNLOCK); + ret = rmap_walk(page, &rwc); + return ret; } void __put_anon_vma(struct anon_vma *anon_vma) @@ -1732,8 +1639,18 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, } /* - * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): - * Called by migrate.c to remove migration ptes, but might be used more later. + * rmap_walk_anon - do something to anonymous page using the object-based + * rmap method + * @page: the page to be handled + * @rwc: control variable according to each walk type + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the anon_vma struct it points to. + * + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * LOCKED. */ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) { @@ -1763,6 +1680,19 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) return ret; } +/* + * rmap_walk_file - do something to file page using the object-based rmap method + * @page: the page to be handled + * @rwc: control variable according to each walk type + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * LOCKED. 
+ */ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) { struct address_space *mapping = page->mapping; -- cgit 1.4.1 From 9f32624be943538983eb0f18b73a9052d1493c80 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:53 -0800 Subject: mm/rmap: use rmap_walk() in page_referenced() Now, we have an infrastructure in rmap_walk() to handle difference from variants of rmap traversing functions. So, just use it in page_referenced(). In this patch, I change following things. 1. remove some variants of rmap traversing functions. cf> page_referenced_ksm, page_referenced_anon, page_referenced_file 2. introduce new struct page_referenced_arg and pass it to page_referenced_one(), main function of rmap_walk, in order to count reference, to store vm_flags and to check finish condition. 3. mechanical change to use rmap_walk() in page_referenced(). [liwanp@linux.vnet.ibm.com: fix BUG at rmap_walk] Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Wanpeng Li Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 2 - include/linux/rmap.h | 2 +- mm/ksm.c | 60 ++------------- mm/rmap.c | 210 ++++++++++++++++++--------------------------------- 4 files changed, 80 insertions(+), 194 deletions(-) (limited to 'include') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 91b9719722c3..3be6bb18562d 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -73,8 +73,6 @@ static inline void set_page_stable_node(struct page *page, struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); -int page_referenced_ksm(struct page *page, - struct mem_cgroup *memcg, unsigned long *vm_flags); int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 2462458708cd..1da693d51255 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -184,7 +184,7 @@ static inline void page_dup_rmap(struct page *page) int page_referenced(struct page *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); int page_referenced_one(struct page *, struct vm_area_struct *, - unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); + unsigned long address, void *arg); #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) diff --git a/mm/ksm.c b/mm/ksm.c index 646d45a6b6c8..3df141e5f3e0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1891,61 +1891,6 @@ struct page *ksm_might_need_to_copy(struct page *page, return new_page; } -int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, - unsigned long *vm_flags) -{ - struct stable_node *stable_node; - struct rmap_item *rmap_item; - unsigned int mapcount = page_mapcount(page); - int referenced = 0; - int search_new_forks = 0; - - VM_BUG_ON(!PageKsm(page)); - VM_BUG_ON(!PageLocked(page)); - - stable_node = page_stable_node(page); - if (!stable_node) - return 0; -again: - hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { - struct anon_vma *anon_vma = rmap_item->anon_vma; - struct anon_vma_chain *vmac; - struct vm_area_struct *vma; - - anon_vma_lock_read(anon_vma); - anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, - 0, ULONG_MAX) { - vma = vmac->vma; - if (rmap_item->address < vma->vm_start || - rmap_item->address >= vma->vm_end) - continue; - /* - * Initially we 
examine only the vma which covers this - * rmap_item; but later, if there is still work to do, - * we examine covering vmas in other mms: in case they - * were forked from the original since ksmd passed. - */ - if ((rmap_item->mm == vma->vm_mm) == search_new_forks) - continue; - - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - - referenced += page_referenced_one(page, vma, - rmap_item->address, &mapcount, vm_flags); - if (!search_new_forks || !mapcount) - break; - } - anon_vma_unlock_read(anon_vma); - if (!mapcount) - goto out; - } - if (!search_new_forks++) - goto again; -out: - return referenced; -} - int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; @@ -1954,6 +1899,11 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) int search_new_forks = 0; VM_BUG_ON(!PageKsm(page)); + + /* + * Rely on the page lock to protect against concurrent modifications + * to that page's node of the stable tree. + */ VM_BUG_ON(!PageLocked(page)); stable_node = page_stable_node(page); diff --git a/mm/rmap.c b/mm/rmap.c index c73e0c645d09..080413036406 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) return 1; } +struct page_referenced_arg { + int mapcount; + int referenced; + unsigned long vm_flags; + struct mem_cgroup *memcg; +}; /* - * Subfunctions of page_referenced: page_referenced_one called - * repeatedly from either page_referenced_anon or page_referenced_file. + * arg: page_referenced_arg will be passed */ int page_referenced_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, unsigned int *mapcount, - unsigned long *vm_flags) + unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; int referenced = 0; + struct page_referenced_arg *pra = arg; if (unlikely(PageTransHuge(page))) { pmd_t *pmd; @@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); if (!pmd) - goto out; + return SWAP_AGAIN; if (vma->vm_flags & VM_LOCKED) { spin_unlock(ptl); - *mapcount = 0; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out; + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ } /* go ahead even if the pmd is pmd_trans_splitting() */ @@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, */ pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) - goto out; + return SWAP_AGAIN; if (vma->vm_flags & VM_LOCKED) { pte_unmap_unlock(pte, ptl); - *mapcount = 0; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out; + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ } if (ptep_clear_flush_young_notify(vma, address, pte)) { @@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - (*mapcount)--; - - if (referenced) - *vm_flags |= vma->vm_flags; -out: - return referenced; -} - -static int page_referenced_anon(struct page *page, - struct mem_cgroup *memcg, - unsigned long *vm_flags) -{ - unsigned int mapcount; - struct anon_vma *anon_vma; - pgoff_t pgoff; - struct anon_vma_chain *avc; - int referenced = 0; - - anon_vma = page_lock_anon_vma_read(page); - if (!anon_vma) - return referenced; - - mapcount = page_mapcount(page); - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - 
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { - struct vm_area_struct *vma = avc->vma; - unsigned long address = vma_address(page, vma); - /* - * If we are reclaiming on behalf of a cgroup, skip - * counting on behalf of references from different - * cgroups - */ - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - referenced += page_referenced_one(page, vma, address, - &mapcount, vm_flags); - if (!mapcount) - break; + if (referenced) { + pra->referenced++; + pra->vm_flags |= vma->vm_flags; } - page_unlock_anon_vma_read(anon_vma); - return referenced; + pra->mapcount--; + if (!pra->mapcount) + return SWAP_SUCCESS; /* To break the loop */ + + return SWAP_AGAIN; } -/** - * page_referenced_file - referenced check for object-based rmap - * @page: the page we're checking references on. - * @memcg: target memory control group - * @vm_flags: collect encountered vma->vm_flags who actually referenced the page - * - * For an object-based mapped page, find all the places it is mapped and - * check/clear the referenced flag. This is done by following the page->mapping - * pointer, then walking the chain of vmas it holds. It returns the number - * of references it found. - * - * This function is only called from page_referenced for object-based pages. - */ -static int page_referenced_file(struct page *page, - struct mem_cgroup *memcg, - unsigned long *vm_flags) +static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) { - unsigned int mapcount; - struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct vm_area_struct *vma; - int referenced = 0; - - /* - * The caller's checks on page->mapping and !PageAnon have made - * sure that this is a file page: the check for page->mapping - * excludes the case just before it gets set on an anon page. - */ - BUG_ON(PageAnon(page)); + struct page_referenced_arg *pra = arg; + struct mem_cgroup *memcg = pra->memcg; - /* - * The page lock not only makes sure that page->mapping cannot - * suddenly be NULLified by truncation, it makes sure that the - * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_mutex. - */ - BUG_ON(!PageLocked(page)); - - mutex_lock(&mapping->i_mmap_mutex); - - /* - * i_mmap_mutex does not stabilize mapcount at all, but mapcount - * is more likely to be accurate if we note it after spinning. 
- */ - mapcount = page_mapcount(page); - - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - unsigned long address = vma_address(page, vma); - /* - * If we are reclaiming on behalf of a cgroup, skip - * counting on behalf of references from different - * cgroups - */ - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - referenced += page_referenced_one(page, vma, address, - &mapcount, vm_flags); - if (!mapcount) - break; - } + if (!mm_match_cgroup(vma->vm_mm, memcg)) + return true; - mutex_unlock(&mapping->i_mmap_mutex); - return referenced; + return false; } /** @@ -851,32 +768,47 @@ int page_referenced(struct page *page, struct mem_cgroup *memcg, unsigned long *vm_flags) { - int referenced = 0; + int ret; int we_locked = 0; + struct page_referenced_arg pra = { + .mapcount = page_mapcount(page), + .memcg = memcg, + }; + struct rmap_walk_control rwc = { + .rmap_one = page_referenced_one, + .arg = (void *)&pra, + .anon_lock = page_lock_anon_vma_read, + }; *vm_flags = 0; - if (page_mapped(page) && page_rmapping(page)) { - if (!is_locked && (!PageAnon(page) || PageKsm(page))) { - we_locked = trylock_page(page); - if (!we_locked) { - referenced++; - goto out; - } - } - if (unlikely(PageKsm(page))) - referenced += page_referenced_ksm(page, memcg, - vm_flags); - else if (PageAnon(page)) - referenced += page_referenced_anon(page, memcg, - vm_flags); - else if (page->mapping) - referenced += page_referenced_file(page, memcg, - vm_flags); - if (we_locked) - unlock_page(page); + if (!page_mapped(page)) + return 0; + + if (!page_rmapping(page)) + return 0; + + if (!is_locked && (!PageAnon(page) || PageKsm(page))) { + we_locked = trylock_page(page); + if (!we_locked) + return 1; } -out: - return referenced; + + /* + * If we are reclaiming on behalf of a cgroup, skip + * counting on behalf of references from different + * cgroups + */ + if (memcg) { + rwc.invalid_vma = invalid_page_referenced_vma; + } + + ret = rmap_walk(page, &rwc); + *vm_flags = pra.vm_flags; + + if (we_locked) + unlock_page(page); + + return pra.referenced; } static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, @@ -1700,6 +1632,14 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) struct vm_area_struct *vma; int ret = SWAP_AGAIN; + /* + * The page lock not only makes sure that page->mapping cannot + * suddenly be NULLified by truncation, it makes sure that the + * structure at mapping cannot be freed and reused yet, + * so we can safely take mapping->i_mmap_mutex. + */ + VM_BUG_ON(!PageLocked(page)); + if (!mapping) return ret; mutex_lock(&mapping->i_mmap_mutex); @@ -1731,8 +1671,6 @@ done: int rmap_walk(struct page *page, struct rmap_walk_control *rwc) { - VM_BUG_ON(!PageLocked(page)); - if (unlikely(PageKsm(page))) return rmap_walk_ksm(page, rwc); else if (PageAnon(page)) -- cgit 1.4.1 From 0c740d0afc3bff0a097ad03a1c8df92757516f5c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jan 2014 15:49:56 -0800 Subject: introduce for_each_thread() to replace the buggy while_each_thread() while_each_thread() and next_thread() should die, almost every lockless usage is wrong. 1. Unless g == current, the lockless while_each_thread() is not safe. while_each_thread(g, t) can loop forever if g exits, next_thread() can't reach the unhashed thread in this case. Note that this can happen even if g is the group leader, it can exec. 2. Even if while_each_thread() itself was correct, people often use it wrongly. 
It was never safe to just take rcu_read_lock() and loop unless you verify that pid_alive(g) == T, even the first next_thread() can point to the already freed/reused memory. This patch adds signal_struct->thread_head and task->thread_node to create the normal rcu-safe list with the stable head. The new for_each_thread(g, t) helper is always safe under rcu_read_lock() as long as this task_struct can't go away. Note: of course it is ugly to have both task_struct->thread_node and the old task_struct->thread_group, we will kill it later, after we change the users of while_each_thread() to use for_each_thread(). Perhaps we can kill it even before we convert all users, we can reimplement next_thread(t) using the new thread_head/thread_node. But we can't do this right now because this will lead to subtle behavioural changes. For example, do/while_each_thread() always sees at least one task, while for_each_thread() can do nothing if the whole thread group has died. Or thread_group_empty(), currently its semantics is not clear unless thread_group_leader(p) and we need to audit the callers before we can change it. So this patch adds the new interface which has to coexist with the old one for some time, hopefully the next changes will be more or less straightforward and the old one will go away soon. Signed-off-by: Oleg Nesterov Reviewed-by: Sergey Dyasly Tested-by: Sergey Dyasly Reviewed-by: Sameer Nanda Acked-by: David Rientjes Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Mandeep Singh Baines Cc: "Ma, Xindong" Cc: Michal Hocko Cc: "Tu, Xiaobing" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init_task.h | 2 ++ include/linux/sched.h | 12 ++++++++++++ kernel/exit.c | 1 + kernel/fork.c | 7 +++++++ 4 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f0e52383a001..1516a8ff8f92 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -41,6 +41,7 @@ extern struct fs_struct init_fs; #define INIT_SIGNALS(sig) { \ .nr_threads = 1, \ + .thread_head = LIST_HEAD_INIT(init_task.thread_node), \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ .shared_pending = { \ .list = LIST_HEAD_INIT(sig.shared_pending.list), \ @@ -222,6 +223,7 @@ extern struct task_group root_task_group; [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ }, \ .thread_group = LIST_HEAD_INIT(tsk.thread_group), \ + .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \ INIT_IDS \ INIT_PERF_EVENTS(tsk) \ INIT_TRACE_IRQFLAGS \ diff --git a/include/linux/sched.h b/include/linux/sched.h index ffccdad050b5..485234d2fd42 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -549,6 +549,7 @@ struct signal_struct { atomic_t sigcnt; atomic_t live; int nr_threads; + struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ @@ -1271,6 +1272,7 @@ struct task_struct { /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; struct list_head thread_group; + struct list_head thread_node; struct completion *vfork_done; /* for vfork() */ int __user *set_child_tid; /* CLONE_CHILD_SETTID */ @@ -2341,6 +2343,16 @@ extern bool current_is_single_threaded(void); #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) +#define __for_each_thread(signal, t) \ + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) + +#define for_each_thread(p, t) \ + __for_each_thread((p)->signal, t) + +/* Careful: this is a double loop, 'break' won't work as expected. 
*/ +#define for_each_process_thread(p, t) \ + for_each_process(p) for_each_thread(p, t) + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; diff --git a/kernel/exit.c b/kernel/exit.c index a949819055d5..1e77fc645317 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -74,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_group); + list_del_rcu(&p->thread_node); } /* diff --git a/kernel/fork.c b/kernel/fork.c index 294189fc7ac8..2f11bbe376b0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1035,6 +1035,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nr_threads = 1; atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); + + /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ + sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); + tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); + init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); @@ -1474,6 +1479,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, atomic_inc(¤t->signal->sigcnt); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); + list_add_tail_rcu(&p->thread_node, + &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); nr_threads++; -- cgit 1.4.1 From 10e89523bf5aade79081f501452fe7f1a16fa189 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:08 -0800 Subject: mm/bootmem: remove duplicated declaration of __free_pages_bootmem() The __free_pages_bootmem is used internally by MM core and already defined in internal.h. So, remove duplicated declaration. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Reviewed-by: Tejun Heo Cc: Yinghai Lu Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index f1f07d31a3af..55d52fb7ac1d 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -52,7 +52,6 @@ extern void free_bootmem_node(pg_data_t *pgdat, unsigned long size); extern void free_bootmem(unsigned long physaddr, unsigned long size); extern void free_bootmem_late(unsigned long physaddr, unsigned long size); -extern void __free_pages_bootmem(struct page *page, unsigned int order); /* * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, -- cgit 1.4.1 From 87029ee9390b2297dae699d5fb135b77992116e5 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:14 -0800 Subject: mm/memblock: reorder parameters of memblock_find_in_range_node Reorder parameters of memblock_find_in_range_node to be consistent with other memblock APIs. The change was suggested by Tejun Heo . Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. 
Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 5 +++-- mm/memblock.c | 16 ++++++++-------- mm/nobootmem.c | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 2f52c8c492bd..11c31590cc49 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -55,8 +55,9 @@ extern bool movable_node_enabled; #define memblock_dbg(fmt, ...) \ if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) -phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align, int nid); +phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, + phys_addr_t start, phys_addr_t end, + int nid); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); diff --git a/mm/memblock.c b/mm/memblock.c index 6aca54812db0..a95d6dc066d5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -157,10 +157,10 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, /** * memblock_find_in_range_node - find free area in given range and node - * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @size: size of free area to find * @align: alignment of free area to find + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @nid: nid of the free area to find, %MAX_NUMNODES for any node * * Find @size free area aligned to @align in the specified range and node. @@ -176,9 +176,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * RETURNS: * Found address on success, 0 on failure. 
*/ -phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, - phys_addr_t end, phys_addr_t size, - phys_addr_t align, int nid) +phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid) { int ret; phys_addr_t kernel_end; @@ -241,8 +241,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align) { - return memblock_find_in_range_node(start, end, size, align, - MAX_NUMNODES); + return memblock_find_in_range_node(size, align, start, end, + MAX_NUMNODES); } static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) @@ -975,7 +975,7 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, /* align @size to avoid excessive fragmentation on reserved array */ size = round_up(size, align); - found = memblock_find_in_range_node(0, max_addr, size, align, nid); + found = memblock_find_in_range_node(size, align, 0, max_addr, nid); if (found && !memblock_reserve(found, size)) return found; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 2c254d374655..59777e050d09 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -41,7 +41,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, if (limit > memblock.current_limit) limit = memblock.current_limit; - addr = memblock_find_in_range_node(goal, limit, size, align, nid); + addr = memblock_find_in_range_node(size, align, goal, limit, nid); if (!addr) return NULL; -- cgit 1.4.1 From b115423357e0cda6d8f45d0c81df537d7b004020 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:16 -0800 Subject: mm/memblock: switch to use NUMA_NO_NODE instead of MAX_NUMNODES It's recommended to use NUMA_NO_NODE everywhere to select "process any node" behavior or to indicate that "no node id specified". Hence, update __next_free_mem_range*() API's to accept both NUMA_NO_NODE and MAX_NUMNODES, but emit warning once on MAX_NUMNODES, and correct corresponding API's documentation to describe new behavior. Also, update other memblock/nobootmem APIs where MAX_NUMNODES is used dirrectly. The change was suggested by Tejun Heo. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. 
Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 4 ++-- mm/memblock.c | 28 +++++++++++++++++++--------- mm/nobootmem.c | 8 ++++---- 3 files changed, 25 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 11c31590cc49..cd0274bebd4c 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -118,7 +118,7 @@ void __next_free_mem_range(u64 *idx, int nid, phys_addr_t *out_start, /** * for_each_free_mem_range - iterate through free memblock areas * @i: u64 used as loop variable - * @nid: node selector, %MAX_NUMNODES for all nodes + * @nid: node selector, %NUMA_NO_NODE for all nodes * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL @@ -138,7 +138,7 @@ void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, /** * for_each_free_mem_range_reverse - rev-iterate through free memblock areas * @i: u64 used as loop variable - * @nid: node selector, %MAX_NUMNODES for all nodes + * @nid: node selector, %NUMA_NO_NODE for all nodes * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL diff --git a/mm/memblock.c b/mm/memblock.c index a95d6dc066d5..03f1dc7b663c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -94,7 +94,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @size: size of free area to find * @align: alignment of free area to find - * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Utility called from memblock_find_in_range_node(), find free area bottom-up. * @@ -126,7 +126,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @size: size of free area to find * @align: alignment of free area to find - * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Utility called from memblock_find_in_range_node(), find free area top-down. * @@ -161,7 +161,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * @align: alignment of free area to find * @start: start of candidate range * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} - * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Find @size free area aligned to @align in the specified range and node. 
* @@ -242,7 +242,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t align) { return memblock_find_in_range_node(size, align, start, end, - MAX_NUMNODES); + NUMA_NO_NODE); } static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) @@ -754,7 +754,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) /** * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable - * @nid: node selector, %MAX_NUMNODES for all nodes + * @nid: node selector, %NUMA_NO_NODE for all nodes * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL @@ -782,6 +782,11 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, struct memblock_type *rsv = &memblock.reserved; int mi = *idx & 0xffffffff; int ri = *idx >> 32; + bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); + + if (nid == MAX_NUMNODES) + pr_warn_once("%s: Usage of MAX_NUMNODES is depricated. Use NUMA_NO_NODE instead\n", + __func__); for ( ; mi < mem->cnt; mi++) { struct memblock_region *m = &mem->regions[mi]; @@ -789,7 +794,7 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; /* only memory regions are associated with nodes, check it */ - if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) + if (check_node && nid != memblock_get_region_node(m)) continue; /* scan areas before each reservation for intersection */ @@ -830,7 +835,7 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, /** * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() * @idx: pointer to u64 loop variable - * @nid: nid: node selector, %MAX_NUMNODES for all nodes + * @nid: nid: node selector, %NUMA_NO_NODE for all nodes * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL @@ -850,6 +855,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, struct memblock_type *rsv = &memblock.reserved; int mi = *idx & 0xffffffff; int ri = *idx >> 32; + bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); + + if (nid == MAX_NUMNODES) + pr_warn_once("%s: Usage of MAX_NUMNODES is depricated. 
Use NUMA_NO_NODE instead\n", + __func__); if (*idx == (u64)ULLONG_MAX) { mi = mem->cnt - 1; @@ -862,7 +872,7 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; /* only memory regions are associated with nodes, check it */ - if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) + if (check_node && nid != memblock_get_region_node(m)) continue; /* skip hotpluggable memory regions if needed */ @@ -989,7 +999,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); + return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); } phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 59777e050d09..19121ceb8874 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -117,7 +117,7 @@ static unsigned long __init free_low_memory_core_early(void) phys_addr_t start, end, size; u64 i; - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) count += __free_memory_core(start, end); /* free range that is used for reserved array if we allocate it */ @@ -161,7 +161,7 @@ unsigned long __init free_all_bootmem(void) reset_all_zones_managed_pages(); /* - * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id + * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 */ @@ -215,7 +215,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, restart: - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); if (ptr) return ptr; @@ -299,7 +299,7 @@ again: if (ptr) return ptr; - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, + ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); if (ptr) return ptr; -- cgit 1.4.1 From 26f09e9b3a0696f6fe20b021901300fba26fb579 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:19 -0800 Subject: mm/memblock: add memblock memory allocation apis Introduce memblock memory allocation APIs which allow to support PAE or LPAE extension on 32 bits archs where the physical memory start address can be beyond 4GB. In such cases, existing bootmem APIs which operate on 32 bit addresses won't work and needs memblock layer which operates on 64 bit addresses. So we add equivalent APIs so that we can replace usage of bootmem with memblock interfaces. Architectures already converted to NO_BOOTMEM use these new memblock interfaces. The architectures which are still not converted to NO_BOOTMEM continue to function as is because we still maintain the fal lback option of bootmem back-end supporting these new interfaces. So no functional change as such. In long run, once all the architectures moves to NO_BOOTMEM, we can get rid of bootmem layer completely. This is one step to remove the core code dependency with bootmem and also gives path for architectures to move away from bootmem. The proposed interface will became active if both CONFIG_HAVE_MEMBLOCK and CONFIG_NO_BOOTMEM are specified by arch. 
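As a rough illustration of the intended call pattern (a sketch only, not part of the patch: the init function below is hypothetical, and it assumes a CONFIG_HAVE_MEMBLOCK && CONFIG_NO_BOOTMEM configuration), early boot code would use the wrappers added below in include/linux/bootmem.h along these lines:

static void __init example_early_setup(void)
{
	void *table, *scratch;

	/*
	 * Zeroed, SMP_CACHE_BYTES aligned (align == 0), any node;
	 * panics if the request cannot be satisfied.
	 */
	table = memblock_virt_alloc(PAGE_SIZE, 0);

	/* Prefer node 0, but return NULL instead of panicking on failure. */
	scratch = memblock_virt_alloc_node_nopanic(PAGE_SIZE, 0);
	if (scratch)
		memblock_free_early(__pa(scratch), PAGE_SIZE);

	/* 'table' remains allocated (memblock-reserved) for the lifetime of the system. */
}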
In case !CONFIG_NO_BOOTMEM, the memblock() wrappers will fallback to the existing bootmem apis so that arch's not converted to NO_BOOTMEM continue to work as is. The meaning of MEMBLOCK_ALLOC_ACCESSIBLE and MEMBLOCK_ALLOC_ANYWHERE is kept same. [akpm@linux-foundation.org: s/depricated/deprecated/] Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/dma.h | 4 +- include/linux/bootmem.h | 152 +++++++++++++++++++++++++++++++++ mm/memblock.c | 209 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 361 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h index 58b8c6a0ab1f..99084431d6ae 100644 --- a/arch/arm/include/asm/dma.h +++ b/arch/arm/include/asm/dma.h @@ -8,8 +8,8 @@ #define MAX_DMA_ADDRESS 0xffffffffUL #else #define MAX_DMA_ADDRESS ({ \ - extern unsigned long arm_dma_zone_size; \ - arm_dma_zone_size ? \ + extern phys_addr_t arm_dma_zone_size; \ + arm_dma_zone_size && arm_dma_zone_size < (0x10000000 - PAGE_OFFSET) ? \ (PAGE_OFFSET + arm_dma_zone_size) : 0xffffffffUL; }) #endif diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 55d52fb7ac1d..2fae55def608 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -5,6 +5,7 @@ #define _LINUX_BOOTMEM_H #include +#include #include /* @@ -141,6 +142,157 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, #define alloc_bootmem_low_pages_node(pgdat, x) \ __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) + +#if defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM) + +/* FIXME: use MEMBLOCK_ALLOC_* variants here */ +#define BOOTMEM_ALLOC_ACCESSIBLE 0 +#define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0) + +/* FIXME: Move to memblock.h at a point where we remove nobootmem.c */ +void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size, + phys_addr_t align, phys_addr_t min_addr, + phys_addr_t max_addr, int nid); +void *memblock_virt_alloc_try_nid(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, int nid); +void __memblock_free_early(phys_addr_t base, phys_addr_t size); +void __memblock_free_late(phys_addr_t base, phys_addr_t size); + +static inline void * __init memblock_virt_alloc( + phys_addr_t size, phys_addr_t align) +{ + return memblock_virt_alloc_try_nid(size, align, BOOTMEM_LOW_LIMIT, + BOOTMEM_ALLOC_ACCESSIBLE, + NUMA_NO_NODE); +} + +static inline void * __init memblock_virt_alloc_nopanic( + phys_addr_t size, phys_addr_t align) +{ + return memblock_virt_alloc_try_nid_nopanic(size, align, + BOOTMEM_LOW_LIMIT, + BOOTMEM_ALLOC_ACCESSIBLE, + NUMA_NO_NODE); +} + +static inline void * __init memblock_virt_alloc_from_nopanic( + phys_addr_t size, phys_addr_t align, phys_addr_t min_addr) +{ + return memblock_virt_alloc_try_nid_nopanic(size, align, min_addr, + BOOTMEM_ALLOC_ACCESSIBLE, + NUMA_NO_NODE); +} + +static inline void * __init memblock_virt_alloc_node( + phys_addr_t size, int nid) +{ + return memblock_virt_alloc_try_nid(size, 0, BOOTMEM_LOW_LIMIT, + BOOTMEM_ALLOC_ACCESSIBLE, nid); +} + +static inline void * __init memblock_virt_alloc_node_nopanic( + phys_addr_t size, int nid) +{ + return 
memblock_virt_alloc_try_nid_nopanic(size, 0, BOOTMEM_LOW_LIMIT, + BOOTMEM_ALLOC_ACCESSIBLE, + nid); +} + +static inline void __init memblock_free_early( + phys_addr_t base, phys_addr_t size) +{ + __memblock_free_early(base, size); +} + +static inline void __init memblock_free_early_nid( + phys_addr_t base, phys_addr_t size, int nid) +{ + __memblock_free_early(base, size); +} + +static inline void __init memblock_free_late( + phys_addr_t base, phys_addr_t size) +{ + __memblock_free_late(base, size); +} + +#else + +#define BOOTMEM_ALLOC_ACCESSIBLE 0 + + +/* Fall back to all the existing bootmem APIs */ +static inline void * __init memblock_virt_alloc( + phys_addr_t size, phys_addr_t align) +{ + if (!align) + align = SMP_CACHE_BYTES; + return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT); +} + +static inline void * __init memblock_virt_alloc_nopanic( + phys_addr_t size, phys_addr_t align) +{ + if (!align) + align = SMP_CACHE_BYTES; + return __alloc_bootmem_nopanic(size, align, BOOTMEM_LOW_LIMIT); +} + +static inline void * __init memblock_virt_alloc_from_nopanic( + phys_addr_t size, phys_addr_t align, phys_addr_t min_addr) +{ + return __alloc_bootmem_nopanic(size, align, min_addr); +} + +static inline void * __init memblock_virt_alloc_node( + phys_addr_t size, int nid) +{ + return __alloc_bootmem_node(NODE_DATA(nid), size, SMP_CACHE_BYTES, + BOOTMEM_LOW_LIMIT); +} + +static inline void * __init memblock_virt_alloc_node_nopanic( + phys_addr_t size, int nid) +{ + return __alloc_bootmem_node_nopanic(NODE_DATA(nid), size, + SMP_CACHE_BYTES, + BOOTMEM_LOW_LIMIT); +} + +static inline void * __init memblock_virt_alloc_try_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) +{ + return __alloc_bootmem_node_high(NODE_DATA(nid), size, align, + min_addr); +} + +static inline void * __init memblock_virt_alloc_try_nid_nopanic( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, int nid) +{ + return ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align, + min_addr, max_addr); +} + +static inline void __init memblock_free_early( + phys_addr_t base, phys_addr_t size) +{ + free_bootmem(base, size); +} + +static inline void __init memblock_free_early_nid( + phys_addr_t base, phys_addr_t size, int nid) +{ + free_bootmem_node(NODE_DATA(nid), base, size); +} + +static inline void __init memblock_free_late( + phys_addr_t base, phys_addr_t size) +{ + free_bootmem_late(base, size); +} +#endif /* defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM) */ + #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP extern void *alloc_remap(int nid, unsigned long size); #else diff --git a/mm/memblock.c b/mm/memblock.c index 03f1dc7b663c..018e55dc004d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -21,6 +21,9 @@ #include #include +#include + +#include "internal.h" static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; @@ -785,7 +788,7 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); if (nid == MAX_NUMNODES) - pr_warn_once("%s: Usage of MAX_NUMNODES is depricated. Use NUMA_NO_NODE instead\n", + pr_warn_once("%s: Usage of MAX_NUMNODES is deprecated. 
Use NUMA_NO_NODE instead\n", __func__); for ( ; mi < mem->cnt; mi++) { @@ -858,7 +861,7 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); if (nid == MAX_NUMNODES) - pr_warn_once("%s: Usage of MAX_NUMNODES is depricated. Use NUMA_NO_NODE instead\n", + pr_warn_once("%s: Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n", __func__); if (*idx == (u64)ULLONG_MAX) { @@ -1029,6 +1032,208 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } +/** + * memblock_virt_alloc_internal - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region to allocate (phys address) + * @max_addr: the upper bound of the memory region to allocate (phys address) + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * The @min_addr limit is dropped if it can not be satisfied and the allocation + * will fall back to memory below @min_addr. Also, allocation may fall back + * to any node in the system if the specified node can not + * hold the requested memory. + * + * The allocation is performed from memory region limited by + * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. + * + * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. + * + * The phys address of allocated boot memory block is converted to virtual and + * allocated memory is reset to 0. + * + * In addition, function sets the min_count to 0 using kmemleak_alloc for + * allocated boot memory block, so that it is never reported as leaks. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +static void * __init memblock_virt_alloc_internal( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + phys_addr_t alloc; + void *ptr; + + if (nid == MAX_NUMNODES) + pr_warn("%s: usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE\n", + __func__); + + /* + * Detect any accidental use of these APIs after slab is ready, as at + * this moment memblock may be deinitialized already and its + * internal data may be destroyed (after execution of free_all_bootmem) + */ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, nid); + + if (!align) + align = SMP_CACHE_BYTES; + + /* align @size to avoid excessive fragmentation on reserved array */ + size = round_up(size, align); + +again: + alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, + nid); + if (alloc) + goto done; + + if (nid != NUMA_NO_NODE) { + alloc = memblock_find_in_range_node(size, align, min_addr, + max_addr, NUMA_NO_NODE); + if (alloc) + goto done; + } + + if (min_addr) { + min_addr = 0; + goto again; + } else { + goto error; + } + +done: + memblock_reserve(alloc, size); + ptr = phys_to_virt(alloc); + memset(ptr, 0, size); + + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. This is because many of these blocks + * are only referred via the physical address which is not + * looked up by kmemleak. 
+ */ + kmemleak_alloc(ptr, size, 0, 0); + + return ptr; + +error: + return NULL; +} + +/** + * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides + * additional debug information (including caller info), if enabled. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_virt_alloc_try_nid_nopanic( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + return memblock_virt_alloc_internal(size, align, min_addr, + max_addr, nid); +} + +/** + * memblock_virt_alloc_try_nid - allocate boot memory block with panicking + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() + * which provides debug information (including caller info), if enabled, + * and panics if the request can not be satisfied. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_virt_alloc_try_nid( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + ptr = memblock_virt_alloc_internal(size, align, + min_addr, max_addr, nid); + if (ptr) + return ptr; + + panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr); + return NULL; +} + +/** + * __memblock_free_early - free boot memory block + * @base: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. + * The freeing memory will not be released to the buddy allocator. 
+ */ +void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) +{ + memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", + __func__, (u64)base, (u64)base + size - 1, + (void *)_RET_IP_); + kmemleak_free_part(__va(base), size); + __memblock_remove(&memblock.reserved, base, size); +} + +/* + * __memblock_free_late - free bootmem block pages directly to buddy allocator + * @addr: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are released directly + * to the buddy allocator, no bootmem metadata is updated because it is gone. + */ +void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) +{ + u64 cursor, end; + + memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", + __func__, (u64)base, (u64)base + size - 1, + (void *)_RET_IP_); + kmemleak_free_part(__va(base), size); + cursor = PFN_UP(base); + end = PFN_DOWN(base + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} /* * Remaining API functions -- cgit 1.4.1 From 1c5e9c27cbd966c7f0038698d5dcd5ada3574f47 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:50:59 -0800 Subject: mm: numa: limit scope of lock for NUMA migrate rate limiting NUMA migrate rate limiting protects a migration counter and window using a lock but in some cases this can be a contended lock. It is not critical that the number of pages be perfect, lost updates are acceptable. Reduce the importance of this lock. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 5 +---- mm/migrate.c | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 67ab5febabf7..5f2052c83154 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -764,10 +764,7 @@ typedef struct pglist_data { int kswapd_max_order; enum zone_type classzone_idx; #ifdef CONFIG_NUMA_BALANCING - /* - * Lock serializing the per destination node AutoNUMA memory - * migration rate limiting data. - */ + /* Lock serializing the migrate rate limiting window */ spinlock_t numabalancing_migrate_lock; /* Rate limiting time interval */ diff --git a/mm/migrate.c b/mm/migrate.c index 41eba21f10ba..4612bb2e3677 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1602,26 +1602,29 @@ bool migrate_ratelimited(int node) static bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) { - bool rate_limited = false; - /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! 
*/ - spin_lock(&pgdat->numabalancing_migrate_lock); if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { + spin_lock(&pgdat->numabalancing_migrate_lock); pgdat->numabalancing_migrate_nr_pages = 0; pgdat->numabalancing_migrate_next_window = jiffies + msecs_to_jiffies(migrate_interval_millisecs); + spin_unlock(&pgdat->numabalancing_migrate_lock); } if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) - rate_limited = true; - else - pgdat->numabalancing_migrate_nr_pages += nr_pages; - spin_unlock(&pgdat->numabalancing_migrate_lock); - - return rate_limited; + return true; + + /* + * This is an unlocked non-atomic update so errors are possible. + * The consequences are failing to migrate when we potentiall should + * have which is not severe enough to warrant locking. If it is ever + * a problem, it can be converted to a per-cpu counter. + */ + pgdat->numabalancing_migrate_nr_pages += nr_pages; + return false; } static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) -- cgit 1.4.1 From af1839d722c986ffeaae1e70a6ef1c75ff38dcd5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:51:01 -0800 Subject: mm: numa: trace tasks that fail migration due to rate limiting A low local/remote numa hinting fault ratio is potentially explained by failed migrations. This patch adds a tracepoint that fires when migration fails due to migration rate limitation. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/migrate.h | 26 ++++++++++++++++++++++++++ mm/migrate.c | 5 ++++- 2 files changed, 30 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index ec2a6ccfd7e5..3075ffbb9a83 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -45,6 +45,32 @@ TRACE_EVENT(mm_migrate_pages, __print_symbolic(__entry->reason, MIGRATE_REASON)) ); +TRACE_EVENT(mm_numa_migrate_ratelimit, + + TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages), + + TP_ARGS(p, dst_nid, nr_pages), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN) + __field( pid_t, pid) + __field( int, dst_nid) + __field( unsigned long, nr_pages) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->dst_nid = dst_nid; + __entry->nr_pages = nr_pages; + ), + + TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu", + __entry->comm, + __entry->pid, + __entry->dst_nid, + __entry->nr_pages) +); #endif /* _TRACE_MIGRATE_H */ /* This part must be outside protection */ diff --git a/mm/migrate.c b/mm/migrate.c index 4612bb2e3677..f9e16350d09c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1614,8 +1614,11 @@ static bool numamigrate_update_ratelimit(pg_data_t *pgdat, msecs_to_jiffies(migrate_interval_millisecs); spin_unlock(&pgdat->numabalancing_migrate_lock); } - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) + if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { + trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, + nr_pages); return true; + } /* * This is an unlocked non-atomic update so errors are possible. 
-- cgit 1.4.1 From 286549dcaf4f128cb04f0ad56dfb677d7d19b500 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:51:03 -0800 Subject: sched: add tracepoints related to NUMA task migration This patch adds three tracepoints o trace_sched_move_numa when a task is moved to a node o trace_sched_swap_numa when a task is swapped with another task o trace_sched_stick_numa when a numa-related migration fails The tracepoints allow the NUMA scheduler activity to be monitored and the following high-level metrics can be calculated o NUMA migrated stuck nr trace_sched_stick_numa o NUMA migrated idle nr trace_sched_move_numa o NUMA migrated swapped nr trace_sched_swap_numa o NUMA local swapped trace_sched_swap_numa src_nid == dst_nid (should never happen) o NUMA remote swapped trace_sched_swap_numa src_nid != dst_nid (should == NUMA migrated swapped) o NUMA group swapped trace_sched_swap_numa src_ngid == dst_ngid Maybe a small number of these are acceptable but a high number would be a major surprise. It would be even worse if bounces are frequent. o NUMA avg task migs. Average number of migrations for tasks o NUMA stddev task mig Self-explanatory o NUMA max task migs. Maximum number of migrations for a single task In general the intent of the tracepoints is to help diagnose problems where automatic NUMA balancing appears to be doing an excessive amount of useless work. [akpm@linux-foundation.org: remove semicolon-after-if, repair coding-style] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/sched.h | 87 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 2 + kernel/sched/fair.c | 6 ++- 3 files changed, 94 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 04c308413a5d..67e1bbf83695 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -443,6 +443,93 @@ TRACE_EVENT(sched_process_hang, ); #endif /* CONFIG_DETECT_HUNG_TASK */ +DECLARE_EVENT_CLASS(sched_move_task_template, + + TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), + + TP_ARGS(tsk, src_cpu, dst_cpu), + + TP_STRUCT__entry( + __field( pid_t, pid ) + __field( pid_t, tgid ) + __field( pid_t, ngid ) + __field( int, src_cpu ) + __field( int, src_nid ) + __field( int, dst_cpu ) + __field( int, dst_nid ) + ), + + TP_fast_assign( + __entry->pid = task_pid_nr(tsk); + __entry->tgid = task_tgid_nr(tsk); + __entry->ngid = task_numa_group_id(tsk); + __entry->src_cpu = src_cpu; + __entry->src_nid = cpu_to_node(src_cpu); + __entry->dst_cpu = dst_cpu; + __entry->dst_nid = cpu_to_node(dst_cpu); + ), + + TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d", + __entry->pid, __entry->tgid, __entry->ngid, + __entry->src_cpu, __entry->src_nid, + __entry->dst_cpu, __entry->dst_nid) +); + +/* + * Tracks migration of tasks from one runqueue to another. 
Can be used to + * detect if automatic NUMA balancing is bouncing between nodes + */ +DEFINE_EVENT(sched_move_task_template, sched_move_numa, + TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), + + TP_ARGS(tsk, src_cpu, dst_cpu) +); + +DEFINE_EVENT(sched_move_task_template, sched_stick_numa, + TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), + + TP_ARGS(tsk, src_cpu, dst_cpu) +); + +TRACE_EVENT(sched_swap_numa, + + TP_PROTO(struct task_struct *src_tsk, int src_cpu, + struct task_struct *dst_tsk, int dst_cpu), + + TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu), + + TP_STRUCT__entry( + __field( pid_t, src_pid ) + __field( pid_t, src_tgid ) + __field( pid_t, src_ngid ) + __field( int, src_cpu ) + __field( int, src_nid ) + __field( pid_t, dst_pid ) + __field( pid_t, dst_tgid ) + __field( pid_t, dst_ngid ) + __field( int, dst_cpu ) + __field( int, dst_nid ) + ), + + TP_fast_assign( + __entry->src_pid = task_pid_nr(src_tsk); + __entry->src_tgid = task_tgid_nr(src_tsk); + __entry->src_ngid = task_numa_group_id(src_tsk); + __entry->src_cpu = src_cpu; + __entry->src_nid = cpu_to_node(src_cpu); + __entry->dst_pid = task_pid_nr(dst_tsk); + __entry->dst_tgid = task_tgid_nr(dst_tsk); + __entry->dst_ngid = task_numa_group_id(dst_tsk); + __entry->dst_cpu = dst_cpu; + __entry->dst_nid = cpu_to_node(dst_cpu); + ), + + TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d", + __entry->src_pid, __entry->src_tgid, __entry->src_ngid, + __entry->src_cpu, __entry->src_nid, + __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid, + __entry->dst_cpu, __entry->dst_nid) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 36c951b7eef8..5ae36cc11fe5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) goto out; + trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); out: @@ -4603,6 +4604,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) /* TODO: This is not properly updating schedstats */ + trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b24b6cfde9aa..867b0a4b0893 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1250,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p) p->numa_scan_period = task_scan_min(p); if (env.best_task == NULL) { - int ret = migrate_task_to(p, env.best_cpu); + ret = migrate_task_to(p, env.best_cpu); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); return ret; } ret = migrate_swap(p, env.best_task); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); put_task_struct(env.best_task); return ret; } -- cgit 1.4.1 From 0eb927c0ab789d3d7d69f68acb850f69d4e7c36f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:51:05 -0800 Subject: mm: compaction: trace compaction begin and end The broad goal of the series is to improve allocation success rates for huge pages through memory compaction, while trying not to increase the compaction overhead. 
The original objective was to reintroduce capturing of high-order pages freed by compaction, before they are split by concurrent activity. However, several bugs and opportunities for simple improvements were found in the current implementation, mostly through extra tracepoints (which are however too ugly for now to be considered for sending). The patches mostly deal with two mechanisms that reduce compaction overhead, namely caching the progress of the migrate and free scanners, and marking pageblocks where isolation failed so that they are skipped during further scans.

Patch 1 (from mgorman) adds tracepoints that allow calculating the time spent in compaction and potentially debugging scanner pfn values.
Patch 2 encapsulates some of the functionality for handling deferred compaction for better maintainability, without a functional change; the type is not determined unless it is actually needed.
Patch 3 fixes a bug where cached scanner pfns are sometimes reset only after they have been read to initialize a compaction run.
Patch 4 fixes a bug where the scanners meeting is sometimes not properly detected, which can lead to multiple compaction attempts quitting early without doing any work.
Patch 5 improves the chances of sync compaction to process pageblocks that async compaction has skipped due to being !MIGRATE_MOVABLE.
Patch 6 improves the chances of sync direct compaction to actually do anything when called after async compaction fails during the allocation slowpath.

The impact of the patches was validated using mmtests's stress-highalloc benchmark on an x86_64 machine with 4GB memory. Due to instability of the results (mostly related to the bugs fixed by patches 2 and 3), 10 iterations were performed, taking min, mean and max values for success rates and mean values for time and vmstat-based metrics. First, the default GFP_HIGHUSER_MOVABLE allocations were tested with the patches stacked on top of v3.13-rc2. Patch 2 is OK to serve as a baseline due to no functional changes in patches 1 and 2. Comments below.
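For reading the tables below: the parenthesized deltas are relative to the baseline (first result) column, computed as (baseline - result) / baseline. For example, the Success 1 Min value of 10.00 against the 9.00 baseline is reported as (9.00 - 10.00) / 9.00 = -11.11%, so a negative delta on a Success row means a higher (better) allocation success rate than the baseline.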
stress-highalloc 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-nothp 3-nothp 4-nothp 5-nothp 6-nothp Success 1 Min 9.00 ( 0.00%) 10.00 (-11.11%) 43.00 (-377.78%) 43.00 (-377.78%) 33.00 (-266.67%) Success 1 Mean 27.50 ( 0.00%) 25.30 ( 8.00%) 45.50 (-65.45%) 45.90 (-66.91%) 46.30 (-68.36%) Success 1 Max 36.00 ( 0.00%) 36.00 ( 0.00%) 47.00 (-30.56%) 48.00 (-33.33%) 52.00 (-44.44%) Success 2 Min 10.00 ( 0.00%) 8.00 ( 20.00%) 46.00 (-360.00%) 45.00 (-350.00%) 35.00 (-250.00%) Success 2 Mean 26.40 ( 0.00%) 23.50 ( 10.98%) 47.30 (-79.17%) 47.60 (-80.30%) 48.10 (-82.20%) Success 2 Max 34.00 ( 0.00%) 33.00 ( 2.94%) 48.00 (-41.18%) 50.00 (-47.06%) 54.00 (-58.82%) Success 3 Min 65.00 ( 0.00%) 63.00 ( 3.08%) 85.00 (-30.77%) 84.00 (-29.23%) 85.00 (-30.77%) Success 3 Mean 76.70 ( 0.00%) 70.50 ( 8.08%) 86.20 (-12.39%) 85.50 (-11.47%) 86.00 (-12.13%) Success 3 Max 87.00 ( 0.00%) 86.00 ( 1.15%) 88.00 ( -1.15%) 87.00 ( 0.00%) 87.00 ( 0.00%) 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-nothp 3-nothp 4-nothp 5-nothp 6-nothp User 6437.72 6459.76 5960.32 5974.55 6019.67 System 1049.65 1049.09 1029.32 1031.47 1032.31 Elapsed 1856.77 1874.48 1949.97 1994.22 1983.15 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-nothp 3-nothp 4-nothp 5-nothp 6-nothp Minor Faults 253952267 254581900 250030122 250507333 250157829 Major Faults 420 407 506 530 530 Swap Ins 4 9 9 6 6 Swap Outs 398 375 345 346 333 Direct pages scanned 197538 189017 298574 287019 299063 Kswapd pages scanned 1809843 1801308 1846674 1873184 1861089 Kswapd pages reclaimed 1806972 1798684 1844219 1870509 1858622 Direct pages reclaimed 197227 188829 298380 286822 298835 Kswapd efficiency 99% 99% 99% 99% 99% Kswapd velocity 953.382 970.449 952.243 934.569 922.286 Direct efficiency 99% 99% 99% 99% 99% Direct velocity 104.058 101.832 153.961 143.200 148.205 Percentage direct scans 9% 9% 13% 13% 13% Zone normal velocity 347.289 359.676 348.063 339.933 332.983 Zone dma32 velocity 710.151 712.605 758.140 737.835 737.507 Zone dma velocity 0.000 0.000 0.000 0.000 0.000 Page writes by reclaim 557.600 429.000 353.600 426.400 381.800 Page writes file 159 53 7 79 48 Page writes anon 398 375 345 346 333 Page reclaim immediate 825 644 411 575 420 Sector Reads 2781750 2769780 2878547 2939128 2910483 Sector Writes 12080843 12083351 12012892 12002132 12010745 Page rescued immediate 0 0 0 0 0 Slabs scanned 1575654 1545344 1778406 1786700 1794073 Direct inode steals 9657 10037 15795 14104 14645 Kswapd inode steals 46857 46335 50543 50716 51796 Kswapd skipped wait 0 0 0 0 0 THP fault alloc 97 91 81 71 77 THP collapse alloc 456 506 546 544 565 THP splits 6 5 5 4 4 THP fault fallback 0 1 0 0 0 THP collapse fail 14 14 12 13 12 Compaction stalls 1006 980 1537 1536 1548 Compaction success 303 284 562 559 578 Compaction failures 702 696 974 976 969 Page migrate success 1177325 1070077 3927538 3781870 3877057 Page migrate failure 0 0 0 0 0 Compaction pages isolated 2547248 2306457 8301218 8008500 8200674 Compaction migrate scanned 42290478 38832618 153961130 154143900 159141197 Compaction free scanned 89199429 79189151 356529027 351943166 356326727 Compaction cost 1566 1426 5312 5156 5294 NUMA PTE updates 0 0 0 0 0 NUMA hint faults 0 0 0 0 0 NUMA hint local faults 0 0 0 0 0 NUMA hint local percent 100 100 100 100 100 NUMA pages migrated 0 0 0 0 0 AutoNUMA cost 0 0 0 0 0 Observations: - The "Success 3" line is allocation success rate with system idle (phases 1 and 2 are with background interference). I used to get stable values around 85% with vanilla 3.11. 
  The lower min and mean values came with 3.12.  This was bisected to commit
  81c0a2bb ("mm: page_alloc: fair zone allocator policy").  As explained in
  the comment for patch 3, I don't think the commit is wrong, but it makes
  the effect of the compaction bugs worse.  From patch 3 onwards, the results
  are OK and match the 3.11 results.

- Patch 4 also clearly helps phases 1 and 2, and exceeds any results I've
  seen with 3.11 (I didn't measure it that thoroughly then, but it was never
  above 40%).

- Compaction cost and the number of scanned pages are higher, especially due
  to patch 4.  However, keep in mind that patches 3 and 4 fix existing bugs
  in the current design of compaction overhead mitigation; they do not change
  that design.  If the overhead is found unacceptable, it should be reduced
  differently (and consistently, not as a side effect of random conditions)
  than the current implementation does.  In contrast, patches 5 and 6 (which
  are not strictly bug fixes) do not increase the overhead (but do not
  increase success rates either).  This might be a limitation of the
  stress-highalloc benchmark, as it's quite uniform.

Another set of results is with stress-highalloc configured to allocate with
flags similar to what THP uses:
(GFP_HIGHUSER_MOVABLE|__GFP_NOMEMALLOC|__GFP_NORETRY|__GFP_NO_KSWAPD)

stress-highalloc         3.13-rc2          3.13-rc2          3.13-rc2          3.13-rc2          3.13-rc2
                            2-thp             3-thp             4-thp             5-thp             6-thp
Success 1 Min       2.00 ( 0.00%)    7.00 (-250.00%)   18.00 (-800.00%)   19.00 (-850.00%)   26.00 (-1200.00%)
Success 1 Mean     19.20 ( 0.00%)   17.80 ( 7.29%)     29.20 (-52.08%)    29.90 (-55.73%)    32.80 (-70.83%)
Success 1 Max      27.00 ( 0.00%)   29.00 ( -7.41%)    35.00 (-29.63%)    36.00 (-33.33%)    37.00 (-37.04%)
Success 2 Min       3.00 ( 0.00%)    8.00 (-166.67%)   21.00 (-600.00%)   21.00 (-600.00%)   32.00 (-966.67%)
Success 2 Mean     19.30 ( 0.00%)   17.90 ( 7.25%)     32.20 (-66.84%)    32.60 (-68.91%)    35.70 (-84.97%)
Success 2 Max      27.00 ( 0.00%)   30.00 (-11.11%)    36.00 (-33.33%)    37.00 (-37.04%)    39.00 (-44.44%)
Success 3 Min      62.00 ( 0.00%)   62.00 ( 0.00%)     85.00 (-37.10%)    75.00 (-20.97%)    64.00 ( -3.23%)
Success 3 Mean     66.30 ( 0.00%)   65.50 ( 1.21%)     85.60 (-29.11%)    83.40 (-25.79%)    83.50 (-25.94%)
Success 3 Max      70.00 ( 0.00%)   69.00 ( 1.43%)     87.00 (-24.29%)    86.00 (-22.86%)    87.00 (-24.29%)

            3.13-rc2   3.13-rc2   3.13-rc2   3.13-rc2   3.13-rc2
               2-thp      3-thp      4-thp      5-thp      6-thp
User         6547.93    6475.85    6265.54    6289.46    6189.96
System       1053.42    1047.28    1043.23    1042.73    1038.73
Elapsed      1835.43    1821.96    1908.67    1912.74    1956.38

                              3.13-rc2    3.13-rc2    3.13-rc2    3.13-rc2    3.13-rc2
                                 2-thp       3-thp       4-thp       5-thp       6-thp
Minor Faults                 256805673   253106328   253222299   249830289   251184418
Major Faults                       395         375         423         434         448
Swap Ins                            12          10          10          12           9
Swap Outs                          530         537         487         455         415
Direct pages scanned             71859       86046      153244      152764      190713
Kswapd pages scanned           1900994     1870240     1898012     1892864     1880520
Kswapd pages reclaimed         1897814     1867428     1894939     1890125     1877924
Direct pages reclaimed           71766       85908      153167      152643      190600
Kswapd efficiency                  99%         99%         99%         99%         99%
Kswapd velocity               1029.000    1067.782    1000.091     991.049     951.218
Direct efficiency                  99%         99%         99%         99%         99%
Direct velocity                 38.897      49.127      80.747      79.983      96.468
Percentage direct scans             3%          4%          7%          7%          9%
Zone normal velocity           351.377     372.494     348.910     341.689     335.310
Zone dma32 velocity            716.520     744.414     731.928     729.343     712.377
Zone dma velocity                0.000       0.000       0.000       0.000       0.000
Page writes by reclaim         669.300     604.000     545.700     538.900     429.900
Page writes file                   138          66          58          83          14
Page writes anon                   530         537         487         455         415
Page reclaim immediate             806         655         772         548         517
Sector Reads                   2711956     2703239     2811602     2818248     2839459
Sector Writes                 12163238    12018662    12038248    11954736    11994892
Page rescued immediate               0           0           0           0           0
Slabs scanned                  1385088     1388364     1507968     1513292     1558656
Direct inode steals               1739        2564        4622        5496        6007
Kswapd inode steals              47461       46406       47804       48013       48466
Kswapd skipped wait                  0           0           0           0           0
THP fault alloc                    110          82          84          69          70
THP collapse alloc                 445         482         467         462         539
THP splits                           6           5           4           5           3
THP fault fallback                   3           0           0           0           0
THP collapse fail                   15          14          14          14          13
Compaction stalls                  659         685        1033        1073        1111
Compaction success                 222         225         410         427         456
Compaction failures                436         460         622         646         655
Page migrate success            446594      439978     1085640     1095062     1131716
Page migrate failure                 0           0           0           0           0
Compaction pages isolated      1029475     1013490     2453074     2482698     2565400
Compaction migrate scanned     9955461    11344259    24375202    27978356    30494204
Compaction free scanned       27715272    28544654    80150615    82898631    85756132
Compaction cost                    552         555        1344        1379        1436
NUMA PTE updates                     0           0           0           0           0
NUMA hint faults                     0           0           0           0           0
NUMA hint local faults               0           0           0           0           0
NUMA hint local percent            100         100         100         100         100
NUMA pages migrated                  0           0           0           0           0
AutoNUMA cost                        0           0           0           0           0

There are some differences from the previous results for THP-like allocations:

- Here, the bad result for the unpatched kernel in phase 3 is much more
  consistent, between 65-70%, and not related to the "regression" in 3.12.
  Still, there is the improvement from patch 4 onwards, which brings it on
  par with the simple GFP_HIGHUSER_MOVABLE allocations.

- Compaction costs have increased, but nowhere near as much as in the
  non-THP case.  Again, the patches should be worth the gained determinism.

- Patches 5 and 6 somewhat increase the number of migrate-scanned pages.
  This is most likely due to the __GFP_NO_KSWAPD flag, which means the
  cached pfn's and pageblock skip bits are not reset by kswapd that often
  (at least in phase 3, where no concurrent activity would wake up kswapd),
  and the patches thus help the sync-after-async compaction.  It doesn't,
  however, show that sync compaction helps success rates that much, which
  can again be seen as a limitation of the benchmark scenario.

This patch (of 6):

Add two tracepoints for compaction begin and end of a zone.  Using this it
is possible to calculate how much time a workload is spending within
compaction and potentially debug problems related to cached pfns for
scanning.  In combination with the direct reclaim and slab trace points it
should be possible to estimate most allocation-related overhead for a
workload.
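As a rough illustration (not part of this patch), one possible way to consume the two new tracepoints from userspace and estimate time spent in compact_zone() is sketched below. It assumes debugfs is mounted at /sys/kernel/debug and that events/compaction/mm_compaction_begin and events/compaction/mm_compaction_end have been enabled, and it naively assumes only one task compacts at a time, since it does not pair events by PID.

#include <stdio.h>
#include <string.h>

/* Extract the timestamp field that precedes "<marker>" on an ftrace line. */
static int parse_ts(const char *line, const char *marker, double *ts)
{
	const char *ev = strstr(line, marker);
	const char *p;

	if (!ev || ev - line < 2)
		return 0;
	/* Lines look like "task-pid [cpu] flags  <timestamp>: <marker> ..." */
	for (p = ev - 2; p > line && p[-1] != ' '; p--)
		;
	return sscanf(p, "%lf", ts) == 1;
}

int main(void)
{
	FILE *fp = fopen("/sys/kernel/debug/tracing/trace_pipe", "r");
	char line[1024];
	double begin = -1.0, total = 0.0, ts;

	if (!fp) {
		perror("trace_pipe");
		return 1;
	}

	while (fgets(line, sizeof(line), fp)) {
		if (parse_ts(line, "mm_compaction_begin:", &ts)) {
			begin = ts;
		} else if (parse_ts(line, "mm_compaction_end:", &ts) && begin >= 0.0) {
			total += ts - begin;	/* seconds spent in this compact_zone() call */
			begin = -1.0;
			printf("time in compaction so far: %.6f s\n", total);
		}
	}
	fclose(fp);
	return 0;
}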
Signed-off-by: Mel Gorman Signed-off-by: Vlastimil Babka Cc: Rik van Riel Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/compaction.h | 42 +++++++++++++++++++++++++++++++++++++++ mm/compaction.c | 4 ++++ 2 files changed, 46 insertions(+) (limited to 'include') diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index fde1b3e94c7d..06f544ef2f6f 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -67,6 +67,48 @@ TRACE_EVENT(mm_compaction_migratepages, __entry->nr_failed) ); +TRACE_EVENT(mm_compaction_begin, + TP_PROTO(unsigned long zone_start, unsigned long migrate_start, + unsigned long free_start, unsigned long zone_end), + + TP_ARGS(zone_start, migrate_start, free_start, zone_end), + + TP_STRUCT__entry( + __field(unsigned long, zone_start) + __field(unsigned long, migrate_start) + __field(unsigned long, free_start) + __field(unsigned long, zone_end) + ), + + TP_fast_assign( + __entry->zone_start = zone_start; + __entry->migrate_start = migrate_start; + __entry->free_start = free_start; + __entry->zone_end = zone_end; + ), + + TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu", + __entry->zone_start, + __entry->migrate_start, + __entry->free_start, + __entry->zone_end) +); + +TRACE_EVENT(mm_compaction_end, + TP_PROTO(int status), + + TP_ARGS(status), + + TP_STRUCT__entry( + __field(int, status) + ), + + TP_fast_assign( + __entry->status = status; + ), + + TP_printk("status=%d", __entry->status) +); #endif /* _TRACE_COMPACTION_H */ diff --git a/mm/compaction.c b/mm/compaction.c index f58bcd016f43..a03995eddedb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -970,6 +970,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) __reset_isolation_suitable(zone); + trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); + migrate_prep_local(); while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { @@ -1015,6 +1017,8 @@ out: cc->nr_freepages -= release_freepages(&cc->freepages); VM_BUG_ON(cc->nr_freepages != 0); + trace_mm_compaction_end(ret); + return ret; } -- cgit 1.4.1 From de6c60a6c115acaa721cfd499e028a413d1fcbf3 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Jan 2014 15:51:07 -0800 Subject: mm: compaction: encapsulate defer reset logic Currently there are several functions to manipulate the deferred compaction state variables. The remaining case where the variables are touched directly is when a successful allocation occurs in direct compaction, or is expected to be successful in the future by kswapd. Here, the lowest order that is expected to fail is updated, and in the case of successful allocation, the deferred status and counter is reset completely. Create a new function compaction_defer_reset() to encapsulate this functionality and make it easier to understand the code. No functional change. 
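To make the defer bookkeeping concrete, here is a small self-contained userspace model (not kernel code; struct zone_model is a stand-in for the relevant struct zone fields). defer_compaction() and compaction_deferred() mirror the existing helpers in include/linux/compaction.h, and compaction_defer_reset() is the helper this patch introduces.

#include <stdbool.h>
#include <stdio.h>

#define COMPACT_MAX_DEFER_SHIFT 6	/* do not skip compaction more than 64 times */

/* Mock of the three struct zone fields involved in defer bookkeeping. */
struct zone_model {
	unsigned int compact_considered;
	unsigned int compact_defer_shift;
	int compact_order_failed;
};

/* Called when compaction of this order failed: back off exponentially. */
static void defer_compaction(struct zone_model *z, int order)
{
	z->compact_considered = 0;
	z->compact_defer_shift++;
	if (order < z->compact_order_failed)
		z->compact_order_failed = order;
	if (z->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		z->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

/* Returns true if this compaction attempt should be skipped. */
static bool compaction_deferred(struct zone_model *z, int order)
{
	unsigned int defer_limit = 1U << z->compact_defer_shift;

	if (order < z->compact_order_failed)
		return false;
	if (++z->compact_considered > defer_limit)
		z->compact_considered = defer_limit;
	return z->compact_considered < defer_limit;
}

/* The new helper: a success (or expected success) clears the back-off. */
static void compaction_defer_reset(struct zone_model *z, int order,
				   bool alloc_success)
{
	if (alloc_success) {
		z->compact_considered = 0;
		z->compact_defer_shift = 0;
	}
	if (order >= z->compact_order_failed)
		z->compact_order_failed = order + 1;
}

int main(void)
{
	struct zone_model z = { .compact_order_failed = 9 };	/* arbitrary start */

	defer_compaction(&z, 5);		/* a failure starts deferring    */
	printf("deferred: %d\n", compaction_deferred(&z, 5));	/* prints 1 */
	compaction_defer_reset(&z, 5, true);	/* a success clears the back-off */
	printf("deferred: %d\n", compaction_deferred(&z, 5));	/* prints 0 */
	return 0;
}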
Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 16 ++++++++++++++++ mm/compaction.c | 9 ++++----- mm/page_alloc.c | 5 +---- 3 files changed, 21 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 091d72e70d8a..7e1c76e3cd68 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -62,6 +62,22 @@ static inline bool compaction_deferred(struct zone *zone, int order) return zone->compact_considered < defer_limit; } +/* + * Update defer tracking counters after successful compaction of given order, + * which means an allocation either succeeded (alloc_success == true) or is + * expected to succeed. + */ +static inline void compaction_defer_reset(struct zone *zone, int order, + bool alloc_success) +{ + if (alloc_success) { + zone->compact_considered = 0; + zone->compact_defer_shift = 0; + } + if (order >= zone->compact_order_failed) + zone->compact_order_failed = order + 1; +} + /* Returns true if restarting compaction after many failures */ static inline bool compaction_restarting(struct zone *zone, int order) { diff --git a/mm/compaction.c b/mm/compaction.c index a03995eddedb..927de97cab8d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1124,12 +1124,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) compact_zone(zone, cc); if (cc->order > 0) { - int ok = zone_watermark_ok(zone, cc->order, - low_wmark_pages(zone), 0, 0); - if (ok && cc->order >= zone->compact_order_failed) - zone->compact_order_failed = cc->order + 1; + if (zone_watermark_ok(zone, cc->order, + low_wmark_pages(zone), 0, 0)) + compaction_defer_reset(zone, cc->order, false); /* Currently async compaction is never deferred. */ - else if (!ok && cc->sync) + else if (cc->sync) defer_compaction(zone, cc->order); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b230e838883d..84da0e3bc886 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2235,10 +2235,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (page) { preferred_zone->compact_blockskip_flush = false; - preferred_zone->compact_considered = 0; - preferred_zone->compact_defer_shift = 0; - if (order >= preferred_zone->compact_order_failed) - preferred_zone->compact_order_failed = order + 1; + compaction_defer_reset(preferred_zone, order, true); count_vm_event(COMPACTSUCCESS); return page; } -- cgit 1.4.1 From 59c82b70dcd9cc273c21fae5abc29e41fc732a17 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:51:17 -0800 Subject: mm/migrate: remove putback_lru_pages, fix comment on putback_movable_pages Some part of putback_lru_pages() and putback_movable_pages() is duplicated, so it could confuse us what we should use. We can remove putback_lru_pages() since it is not really needed now. This makes us undestand and maintain the code more easily. And comment on putback_movable_pages() is stale now, so fix it. 
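For context, a condensed kernel-style sketch (not from this patch, and not a standalone program) of the isolate/migrate/putback pattern that remains once putback_lru_pages() is gone; compaction's compact_zone() follows roughly this shape. The function and parameter names are invented for illustration.

#include <linux/migrate.h>
#include <linux/list.h>

/*
 * Condensed illustration only: the caller has isolated movable pages onto a
 * private list, tries to migrate them, and hands anything that could not be
 * migrated back via putback_movable_pages(), which also copes with balloon
 * pages.
 */
static void sketch_migrate_and_putback(struct list_head *migratepages,
				       new_page_t get_new_page,
				       unsigned long private)
{
	int err = migrate_pages(migratepages, get_new_page, private,
				MIGRATE_SYNC, MR_COMPACTION);

	if (err) {
		/*
		 * migrate_pages() leaves pages it could not migrate on the
		 * list; return them to the LRU (or balloon) they came from.
		 */
		putback_movable_pages(migratepages);
	}
}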
Signed-off-by: Joonsoo Kim Reviewed-by: Wanpeng Li Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Rafael Aquini Cc: Vlastimil Babka Cc: Wanpeng Li Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 -- mm/memory-failure.c | 8 +++++++- mm/migrate.c | 29 +++++++++-------------------- 3 files changed, 16 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index f015c059e159..2a411bc0097f 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -35,7 +35,6 @@ enum migrate_reason { #ifdef CONFIG_MIGRATION -extern void putback_lru_pages(struct list_head *l); extern void putback_movable_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); @@ -59,7 +58,6 @@ extern int migrate_page_move_mapping(struct address_space *mapping, int extra_count); #else -static inline void putback_lru_pages(struct list_head *l) {} static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, enum migrate_mode mode, int reason) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9fa6586d5275..b25ed321e667 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1585,7 +1585,13 @@ static int __soft_offline_page(struct page *page, int flags) ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { - putback_lru_pages(&pagelist); + if (!list_empty(&pagelist)) { + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) diff --git a/mm/migrate.c b/mm/migrate.c index 13bedcc4656b..8a73d66be102 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -71,29 +71,13 @@ int migrate_prep_local(void) return 0; } -/* - * Add isolated pages on the list back to the LRU under page lock - * to avoid leaking evictable pages back onto unevictable list. - */ -void putback_lru_pages(struct list_head *l) -{ - struct page *page; - struct page *page2; - - list_for_each_entry_safe(page, page2, l, lru) { - list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); - putback_lru_page(page); - } -} - /* * Put previously isolated pages back onto the appropriate lists * from where they were once taken off for compaction/migration. * - * This function shall be used instead of putback_lru_pages(), - * whenever the isolated pageset has been built by isolate_migratepages_range() + * This function shall be used whenever the isolated pageset has been + * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() + * and isolate_huge_page(). 
*/ void putback_movable_pages(struct list_head *l) { @@ -1725,7 +1709,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); if (nr_remaining) { - putback_lru_pages(&migratepages); + if (!list_empty(&migratepages)) { + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } isolated = 0; } else count_vm_numa_event(NUMA_PAGE_MIGRATE); -- cgit 1.4.1 From 78d5506e82b21a1a1de68c24182db2c2fe521422 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:51:18 -0800 Subject: mm/migrate: remove unused function, fail_migrate_page() fail_migrate_page() isn't used anywhere, so remove it. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Reviewed-by: Naoya Horiguchi Reviewed-by: Wanpeng Li Cc: Rafael Aquini Cc: Vlastimil Babka Cc: Wanpeng Li Cc: Mel Gorman Cc: Rik van Riel Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 4 ---- mm/migrate.c | 8 -------- 2 files changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 2a411bc0097f..84a31ad0b791 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -41,9 +41,6 @@ extern int migrate_page(struct address_space *, extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, enum migrate_mode mode, int reason); -extern int fail_migrate_page(struct address_space *, - struct page *, struct page *); - extern int migrate_prep(void); extern int migrate_prep_local(void); extern int migrate_vmas(struct mm_struct *mm, @@ -84,7 +81,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, /* Possible settings for the migrate_page() method in address_operations */ #define migrate_page NULL -#define fail_migrate_page NULL #endif /* CONFIG_MIGRATION */ diff --git a/mm/migrate.c b/mm/migrate.c index 8a73d66be102..a8025befc323 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -552,14 +552,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) * Migration functions ***********************************************************/ -/* Always fail migration. Used for mappings that are not movable */ -int fail_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) -{ - return -EIO; -} -EXPORT_SYMBOL(fail_migrate_page); - /* * Common logic to directly migrate a single page suitable for * pages that do not use PagePrivate/PagePrivate2. -- cgit 1.4.1