From c32e6e80900a80fbaec23119f854d5e14a47e80b Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Sat, 15 Jul 2017 11:41:14 +1000 Subject: mm: skip HWPoisoned pages when onlining pages b023f46813cd ("memory-hotplug: skip HWPoisoned page when offlining pages") skipped the HWPoisoned pages when offlining pages, but this should be skipped when onlining the pages too. n-horiguchi@ah.jp.nec.com said: : If I read correctly, to "skip HWPoiosned page" in commit b023f46813cd : means that we skip the page status check for hwpoisoned pages *not* to : prevent memory offlining for memblocks with hwpoisoned pages. That : means that hwpoisoned pages can be offlined. : : And another reason to clear PageReserved is that we could reuse the : hwpoisoned page after onlining back with replacing the broken DIMM. In : this usecase, we first do unpoisoning to clear PageHWPoison, but it : doesn't work if PageReserved is set. My simple testing shows the BUG : below in unpoisoning (without the ClearPageReserved): : : Unpoison: Software-unpoisoned page 0x18000 : BUG: Bad page state in process page-types pfn:18000 : page:ffffda5440600000 count:0 mapcount:0 mapping: (null) index:0x70006b599 : flags: 0x1fffc00004081a(error|uptodate|dirty|reserved|swapbacked) : raw: 001fffc00004081a 0000000000000000 000000070006b599 00000000ffffffff : raw: dead000000000100 dead000000000200 0000000000000000 0000000000000000 : page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set : bad because of flags: 0x800(reserved) Link: http://lkml.kernel.org/r/1493130472-22843-3-git-send-email-ldufour@linux.vnet.ibm.com Signed-off-by: Laurent Dufour Cc: Naoya Horiguchi Cc: Andrey Vagin Cc: Glauber Costa Cc: Vladimir Davydov Cc: Balbir Singh Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8dccc317aac2..d620d0427b6b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -689,6 +689,10 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, if (PageReserved(pfn_to_page(start_pfn))) for (i = 0; i < nr_pages; i++) { page = pfn_to_page(start_pfn + i); + if (PageHWPoison(page)) { + ClearPageReserved(page); + continue; + } (*online_page_callback)(page); onlined_pages++; } -- cgit v1.2.3 From 82494ce0a87a3f8d9a3d77e1fdaa9f34a5128633 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 15 Jul 2017 11:41:15 +1000 Subject: arm: arch/arm/include/asm/page.h needs personality.h VM_DATA_DEFAULT_FLAGS uses READ_IMPLIES_EXEC, so page.h should include personality.h to provide this. This fixes no known bugs and can be safely ignored ;) Cc: Russell King Cc: Will Deacon Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton --- arch/arm/include/asm/page.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 4355f0ec44d6..f98baaec0a15 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -17,6 +17,8 @@ #ifndef __ASSEMBLY__ +#include /* For READ_IMPLIES_EXEC */ + #ifndef CONFIG_MMU #include -- cgit v1.2.3 From 947e780b273e35dac08166122ca91fea6f6d4d96 Mon Sep 17 00:00:00 2001 From: Gang He Date: Sat, 15 Jul 2017 11:41:15 +1000 Subject: ocfs2: remove ocfs2_is_o2cb_active() Remove ocfs2_is_o2cb_active(). We have similar functions to identify which cluster stack is being used via osb->osb_cluster_stack. Secondly, the current implementation of ocfs2_is_o2cb_active() is not totally safe. Based on the design of stackglue, we need to get ocfs2_stack_lock before using ocfs2_stack related data structures, and that active_stack pointer can be NULL in the case of mount failure. Link: http://lkml.kernel.org/r/1495441079-11708-1-git-send-email-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Joseph Qi Reviewed-by: Eric Ren Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton --- fs/ocfs2/dlmglue.c | 2 +- fs/ocfs2/stackglue.c | 6 ------ fs/ocfs2/stackglue.h | 3 --- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 4689940a953c..703d8acc5efb 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3413,7 +3413,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, * we can recover correctly from node failure. Otherwise, we may get * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set. */ - if (!ocfs2_is_o2cb_active() && + if (ocfs2_userspace_stack(osb) && lockres->l_ops->flags & LOCK_TYPE_USES_LVB) lvb = 1; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index d6c350ba25b9..c4b029c43464 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -48,12 +48,6 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; */ static struct ocfs2_stack_plugin *active_stack; -inline int ocfs2_is_o2cb_active(void) -{ - return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB); -} -EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active); - static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index e3036e1790e8..f2dce10fae54 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,9 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); -/* In ocfs2_downconvert_lock(), we need to know which stack we are using */ -int ocfs2_is_o2cb_active(void); - extern struct kset *ocfs2_kset; #endif /* STACKGLUE_H */ -- cgit v1.2.3 From 555c056945f7221518a2faf3627e50ecc4b78412 Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Sat, 15 Jul 2017 11:41:15 +1000 Subject: ocfs2: old mle put and release after the function dlm_add_migration_mle called If the old mle is found after the dlm_add_migration_mle called, it should be put once. If the return value is not - EEXIST and its type is BLOCK, it should be put again to release it to avoid memory leak, for it had been unhashed from the map. Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA4A3D4B7FE@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: Guozhonghua Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmmaster.c | 62 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3e04279446e8..4438671c4ac3 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2612,20 +2612,45 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, spin_lock(&dlm->master_lock); ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); + if (ret == -EEXIST) { + if(oldmle) + __dlm_put_mle(oldmle); + + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + mlog(0, "another process is already migrating it\n"); + goto fail; + } + + /* If an old one mle found, it should be put. if its type is BLOCK, + * it should be put again. Because it had been unhasded from the map + * in the function dlm_add_migration_mle. + * otherwise the memory will be leaked. It will not found it again from + * the hash map. + */ + if (oldmle) { + /* master is known, detach if not already detached */ + __dlm_mle_detach_hb_events(dlm, oldmle); + __dlm_put_mle(oldmle); + + /* if the type of the mle is BLOCK, should put it once for release. + * otherwise memory leak may be caused because oldmle had been unhashed + * from the hash map, it will not be found anymore. + */ + if (oldmle->type == DLM_MLE_BLOCK) + __dlm_put_mle(oldmle); + } + /* get an extra reference on the mle. * otherwise the assert_master from the new * master will destroy this. */ dlm_get_mle_inuse(mle); + mle_added = 1; + spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); - if (ret == -EEXIST) { - mlog(0, "another process is already migrating it\n"); - goto fail; - } - mle_added = 1; - /* * set the MIGRATING flag and flush asts * if we fail after this we need to re-dirty the lockres @@ -2642,12 +2667,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, } fail: - if (ret != -EEXIST && oldmle) { - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, oldmle); - dlm_put_mle(oldmle); - } - if (ret < 0) { if (mle_added) { dlm_mle_detach_hb_events(dlm, mle); @@ -3182,16 +3201,23 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, if (ret < 0) kmem_cache_free(dlm_mle_cache, mle); + /* If an old one mle found, it should be put. if its type is BLOCK, + * it should be put again. Because it had been unhasded from the map + * in the function dlm_add_migration_mle. + * otherwise the memory will be leaked. It will not found it again from + * the hash map. + */ + if (oldmle) { + __dlm_mle_detach_hb_events(dlm, oldmle); + __dlm_put_mle(oldmle); + if (ret >= 0 && oldmle->type == DLM_MLE_BLOCK) + __dlm_put_mle(oldmle); + } + spin_unlock(&dlm->master_lock); unlock: spin_unlock(&dlm->spinlock); - if (oldmle) { - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, oldmle); - dlm_put_mle(oldmle); - } - if (res) dlm_lockres_put(res); leave: -- cgit v1.2.3 From 2d060f378b31d7ea521df9b652650fc7996ef302 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 15 Jul 2017 11:41:15 +1000 Subject: ocfs2-old-mle-put-and-release-after-the-function-dlm_add_migration_mle-called-fix fix coding style, comments Cc: Guozhonghua Cc: Joel Becker Cc: Joseph Qi Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmmaster.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 4438671c4ac3..f0072145eead 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2613,7 +2613,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); if (ret == -EEXIST) { - if(oldmle) + if (oldmle) __dlm_put_mle(oldmle); spin_unlock(&dlm->master_lock); @@ -2622,10 +2622,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, goto fail; } - /* If an old one mle found, it should be put. if its type is BLOCK, - * it should be put again. Because it had been unhasded from the map + /* + * If an old mle is found, it should be put. If its type is BLOCK, + * it should be put again. Because it has been unhasded from the map * in the function dlm_add_migration_mle. - * otherwise the memory will be leaked. It will not found it again from + * Otherwise the memory will be leaked. It will not be found again from * the hash map. */ if (oldmle) { @@ -2633,9 +2634,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, __dlm_mle_detach_hb_events(dlm, oldmle); __dlm_put_mle(oldmle); - /* if the type of the mle is BLOCK, should put it once for release. - * otherwise memory leak may be caused because oldmle had been unhashed - * from the hash map, it will not be found anymore. + /* + * If the type of the mle is BLOCK, it should be put once for + * release. Otherwise a memory leak may be caused because + * oldmle has been unhashed from the hash map and it will not + * be found any more. */ if (oldmle->type == DLM_MLE_BLOCK) __dlm_put_mle(oldmle); @@ -3201,10 +3204,11 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, if (ret < 0) kmem_cache_free(dlm_mle_cache, mle); - /* If an old one mle found, it should be put. if its type is BLOCK, - * it should be put again. Because it had been unhasded from the map - * in the function dlm_add_migration_mle. - * otherwise the memory will be leaked. It will not found it again from + /* + * If an old mle is found, it should be put. If its type is BLOCK, + * it should be put again because it has been unhashed from the map + * in the dlm_add_migration_mle(). + * Otherwise the memory will be leaked. It will not be found again from * the hash map. */ if (oldmle) { -- cgit v1.2.3 From d82442ed243cfff2860ce6938067cb8f0a83bc32 Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Sat, 15 Jul 2017 11:41:15 +1000 Subject: ocfs2/dlm: optimize freeing of dead node locks Three loops can be optimized into one and its sub loops, so less code can do the same work. Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA4C4AF898E@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: Guozhonghua Reviewed-by: Eric Ren Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmrecovery.c | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 74407c6dd592..4c4b18e612c5 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2268,6 +2268,8 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, { struct dlm_lock *lock, *next; unsigned int freed = 0; + struct list_head *queue = NULL; + int i; /* this node is the lockres master: * 1) remove any stale locks for the dead node @@ -2280,31 +2282,18 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */ /* TODO: check pending_asts, pending_basts here */ - list_for_each_entry_safe(lock, next, &res->granted, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; - } - } - list_for_each_entry_safe(lock, next, &res->converting, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; - } - } - list_for_each_entry_safe(lock, next, &res->blocked, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK + * do manually + */ + dlm_lock_put(lock); + freed++; + } } } -- cgit v1.2.3 From 341d2d6f0f30256cff2c323b3e0848e4f0f10755 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 15 Jul 2017 11:41:16 +1000 Subject: ocfs2-dlm-optimization-of-code-while-free-dead-node-locks-checkpatch-fixes WARNING: please, no spaces at the start of a line #26: FILE: fs/ocfs2/dlm/dlmrecovery.c:2271: + struct list_head *queue = NULL;$ WARNING: please, no spaces at the start of a line #27: FILE: fs/ocfs2/dlm/dlmrecovery.c:2272: + int i;$ WARNING: please, no spaces at the start of a line #60: FILE: fs/ocfs2/dlm/dlmrecovery.c:2285: + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) {$ WARNING: suspect code indent for conditional statements (7, 15) #60: FILE: fs/ocfs2/dlm/dlmrecovery.c:2285: + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); ERROR: code indent should use tabs where possible #61: FILE: fs/ocfs2/dlm/dlmrecovery.c:2286: + queue = dlm_list_idx_to_ptr(res, i);$ WARNING: please, no spaces at the start of a line #61: FILE: fs/ocfs2/dlm/dlmrecovery.c:2286: + queue = dlm_list_idx_to_ptr(res, i);$ ERROR: code indent should use tabs where possible #62: FILE: fs/ocfs2/dlm/dlmrecovery.c:2287: + list_for_each_entry_safe(lock, next, queue, list) {$ WARNING: please, no spaces at the start of a line #62: FILE: fs/ocfs2/dlm/dlmrecovery.c:2287: + list_for_each_entry_safe(lock, next, queue, list) {$ WARNING: suspect code indent for conditional statements (15, 23) #62: FILE: fs/ocfs2/dlm/dlmrecovery.c:2287: + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node == dead_node) { ERROR: code indent should use tabs where possible #63: FILE: fs/ocfs2/dlm/dlmrecovery.c:2288: + if (lock->ml.node == dead_node) {$ WARNING: please, no spaces at the start of a line #63: FILE: fs/ocfs2/dlm/dlmrecovery.c:2288: + if (lock->ml.node == dead_node) {$ WARNING: suspect code indent for conditional statements (23, 31) #63: FILE: fs/ocfs2/dlm/dlmrecovery.c:2288: + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); ERROR: code indent should use tabs where possible #64: FILE: fs/ocfs2/dlm/dlmrecovery.c:2289: + list_del_init(&lock->list);$ WARNING: please, no spaces at the start of a line #64: FILE: fs/ocfs2/dlm/dlmrecovery.c:2289: + list_del_init(&lock->list);$ ERROR: code indent should use tabs where possible #65: FILE: fs/ocfs2/dlm/dlmrecovery.c:2290: + dlm_lock_put(lock);$ WARNING: please, no spaces at the start of a line #65: FILE: fs/ocfs2/dlm/dlmrecovery.c:2290: + dlm_lock_put(lock);$ ERROR: code indent should use tabs where possible #66: FILE: fs/ocfs2/dlm/dlmrecovery.c:2291: + /* Can't schedule DLM_UNLOCK_FREE_LOCK$ ERROR: code indent should use tabs where possible #67: FILE: fs/ocfs2/dlm/dlmrecovery.c:2292: + * do manually$ ERROR: code indent should use tabs where possible #68: FILE: fs/ocfs2/dlm/dlmrecovery.c:2293: + */$ ERROR: code indent should use tabs where possible #69: FILE: fs/ocfs2/dlm/dlmrecovery.c:2294: + dlm_lock_put(lock);$ WARNING: please, no spaces at the start of a line #69: FILE: fs/ocfs2/dlm/dlmrecovery.c:2294: + dlm_lock_put(lock);$ ERROR: code indent should use tabs where possible #70: FILE: fs/ocfs2/dlm/dlmrecovery.c:2295: + freed++;$ WARNING: please, no spaces at the start of a line #70: FILE: fs/ocfs2/dlm/dlmrecovery.c:2295: + freed++;$ ERROR: code indent should use tabs where possible #71: FILE: fs/ocfs2/dlm/dlmrecovery.c:2296: + }$ WARNING: please, no spaces at the start of a line #71: FILE: fs/ocfs2/dlm/dlmrecovery.c:2296: + }$ total: 11 errors, 14 warnings, 51 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. NOTE: Whitespace errors detected. You may wish to use scripts/cleanpatch or scripts/cleanfile ./patches/ocfs2-dlm-optimization-of-code-while-free-dead-node-locks.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. Please run checkpatch prior to sending patches Cc: Guozhonghua Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmrecovery.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 4c4b18e612c5..908b05942282 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2268,8 +2268,8 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, { struct dlm_lock *lock, *next; unsigned int freed = 0; - struct list_head *queue = NULL; - int i; + struct list_head *queue = NULL; + int i; /* this node is the lockres master: * 1) remove any stale locks for the dead node @@ -2282,18 +2282,19 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */ /* TODO: check pending_asts, pending_basts here */ - for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { - queue = dlm_list_idx_to_ptr(res, i); - list_for_each_entry_safe(lock, next, queue, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - * do manually - */ - dlm_lock_put(lock); - freed++; - } + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* + * Can't schedule DLM_UNLOCK_FREE_LOCK: do + * manually + */ + dlm_lock_put(lock); + freed++; + } } } -- cgit v1.2.3 From 32d70be6fe37b958f83e4f5f153b2fc696c6092c Mon Sep 17 00:00:00 2001 From: Gang He Date: Sat, 15 Jul 2017 11:41:16 +1000 Subject: ocfs2: give an obvious tip for mismatched cluster names Add an obvious error message, due to mismatched cluster names between on-disk and in the current cluster. We can meet this case during OCFS2 cluster migration. If we can give the user an obvious tip for why they can not mount the file system after migration, they can quickly fix this mismatch problem. Second, also move printing ocfs2_fill_super() errno to the front of ocfs2_dismount_volume(), since ocfs2_dismount_volume() will also print its own message. I looked through all the code of OCFS2 (include o2cb); there is not any place which returns this error. In fact, the function calling path ocfs2_fill_super -> ocfs2_mount_volume -> ocfs2_dlm_init -> dlm_new_lockspace is a very specific one. We can use this errno to give the user a more clear tip, since this case is a little common during cluster migration, but the customer can quickly get the failure cause if there is a error printed. Also, I think it is not possible to add this errno in the o2cb path during ocfs2_dlm_init(), since the o2cb code has been stable for a long time. We only print this error tip when the user uses pcmk stack, since using the o2cb stack the user will not meet this error. Link: http://lkml.kernel.org/r/1495089336-19312-1-git-send-email-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/super.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 83005f486451..24d885bf14bc 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1208,14 +1208,15 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) read_super_error: brelse(bh); + if (status) + mlog_errno(status); + if (osb) { atomic_set(&osb->vol_state, VOLUME_DISABLED); wake_up(&osb->osb_mount_event); ocfs2_dismount_volume(sb, 1); } - if (status) - mlog_errno(status); return status; } @@ -1843,6 +1844,9 @@ static int ocfs2_mount_volume(struct super_block *sb) status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); + if (status == -EBADR) + mlog(ML_ERROR, "couldn't mount because cluster name on" + " disk does not match the running cluster name.\n"); goto leave; } -- cgit v1.2.3 From 7845af6f476326109bde39a77fd819930faac7e2 Mon Sep 17 00:00:00 2001 From: Gang He Date: Sat, 15 Jul 2017 11:41:16 +1000 Subject: ocfs2-give-an-obvious-tip-for-dismatch-cluster-names-v2 Compare with initial version, we only print this error tip when the user uses pcmk stack. since in o2cb stack, the user will not meet this error. Link: http://lkml.kernel.org/r/1495419305-3780-1-git-send-email-ghe@suse.com Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 24d885bf14bc..f03fdd0fc72c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1844,7 +1844,7 @@ static int ocfs2_mount_volume(struct super_block *sb) status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); - if (status == -EBADR) + if (status == -EBADR && ocfs2_userspace_stack(osb)) mlog(ML_ERROR, "couldn't mount because cluster name on" " disk does not match the running cluster name.\n"); goto leave; -- cgit v1.2.3 From 9e6f2ab0e30d48c84d92d496ba9512f9f670741a Mon Sep 17 00:00:00 2001 From: Gang He Date: Sat, 15 Jul 2017 11:41:16 +1000 Subject: ocfs2: move some definitions to header file Patch series "ocfs2: use kobject for online file check", v3. Use embedded kobject mechanism for online file check feature, this will avoid to use a global list to save/search per-device online file check related data. The changed code is based on Goldwyn Rodrigues's patches and ext4 fs code, there is not any new features added, except some very small fixes during this code refactoring. Second, the code change does not affect the underlying file check code. Thank Goldwyn very much. Compare with second version, add more comments in the patch descriptions, to make sure each modification is mentioned. Compare with first version, split the code change into four patches, make sure each patch will not bring ocfs2 kernel modules compiling errors. This patch (of 3): Move some definitions to header file, which will be referenced by other source files when kobject mechanism is introduced. Link: http://lkml.kernel.org/r/1495611866-27360-2-git-send-email-ghe@suse.com Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/filecheck.c | 27 --------------------------- fs/ocfs2/filecheck.h | 27 +++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 2cabbcf2f28e..cc7b595e264d 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -56,33 +56,6 @@ static const char * const ocfs2_filecheck_errs[] = { static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); static LIST_HEAD(ocfs2_filecheck_sysfs_list); -struct ocfs2_filecheck { - struct list_head fc_head; /* File check entry list head */ - spinlock_t fc_lock; - unsigned int fc_max; /* Maximum number of entry in list */ - unsigned int fc_size; /* Current entry count in list */ - unsigned int fc_done; /* Finished entry count in list */ -}; - -struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */ - struct list_head fs_list; - atomic_t fs_count; - struct super_block *fs_sb; - struct kset *fs_devicekset; - struct kset *fs_fcheckkset; - struct ocfs2_filecheck *fs_fcheck; -}; - -#define OCFS2_FILECHECK_MAXSIZE 100 -#define OCFS2_FILECHECK_MINSIZE 10 - -/* File check operation type */ -enum { - OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ - OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ - OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ -}; - struct ocfs2_filecheck_entry { struct list_head fe_list; unsigned long fe_ino; diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h index e5cd002a2c09..af1678b620a4 100644 --- a/fs/ocfs2/filecheck.h +++ b/fs/ocfs2/filecheck.h @@ -43,6 +43,33 @@ enum { #define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED #define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED +struct ocfs2_filecheck { + struct list_head fc_head; /* File check entry list head */ + spinlock_t fc_lock; + unsigned int fc_max; /* Maximum number of entry in list */ + unsigned int fc_size; /* Current entry count in list */ + unsigned int fc_done; /* Finished entry count in list */ +}; + +struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */ + struct list_head fs_list; + atomic_t fs_count; + struct super_block *fs_sb; + struct kset *fs_devicekset; + struct kset *fs_fcheckkset; + struct ocfs2_filecheck *fs_fcheck; +}; + +#define OCFS2_FILECHECK_MAXSIZE 100 +#define OCFS2_FILECHECK_MINSIZE 10 + +/* File check operation type */ +enum { + OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ + OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ + OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ +}; + int ocfs2_filecheck_create_sysfs(struct super_block *sb); int ocfs2_filecheck_remove_sysfs(struct super_block *sb); -- cgit v1.2.3 From a92260d973ae172e58d6408a90bceb9501dc64c3 Mon Sep 17 00:00:00 2001 From: Gang He Date: Sat, 15 Jul 2017 11:41:16 +1000 Subject: ocfs2: fix some small problems First, move setting fe_done = 1 in spin lock, avoid bring any potential race condition. Second, tune mlog message level from ERROR to NOTICE, since the message should not belong to error message. Third, tune errno to -EAGAIN when file check queue is full, this errno is more appropriate in the case. Link: http://lkml.kernel.org/r/1495611866-27360-3-git-send-email-ghe@suse.com Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/filecheck.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index cc7b595e264d..43477272ab96 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -288,7 +288,7 @@ ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent, spin_lock(&ent->fs_fcheck->fc_lock); if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) { - mlog(ML_ERROR, + mlog(ML_NOTICE, "Cannot set online file check maximum entry number " "to %u due to too many pending entries(%u)\n", len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done); @@ -462,8 +462,8 @@ static void ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent, struct ocfs2_filecheck_entry *entry) { - entry->fe_done = 1; spin_lock(&ent->fs_fcheck->fc_lock); + entry->fe_done = 1; ent->fs_fcheck->fc_done++; spin_unlock(&ent->fs_fcheck->fc_lock); } @@ -545,11 +545,11 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj, spin_lock(&ent->fs_fcheck->fc_lock); if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && (ent->fs_fcheck->fc_done == 0)) { - mlog(ML_ERROR, + mlog(ML_NOTICE, "Cannot do more file check " "since file check queue(%u) is full now\n", ent->fs_fcheck->fc_max); - ret = -EBUSY; + ret = -EAGAIN; kfree(entry); } else { if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && -- cgit v1.2.3 From ef4d7619dbc2fd2cba76db700e1958d5a2e3ae1c Mon Sep 17 00:00:00 2001 From: Gang He Date: Sat, 15 Jul 2017 11:41:16 +1000 Subject: ocfs2: add kobject for online file check Use embedded kobject mechanism for online file check feature, this will avoid to use a global list to save/search per-device online file check related data, meanwhile, reduce the code lines and make the code logic clear. The changed code is based on Goldwyn Rodrigues's patches and ext4 fs code. Link: http://lkml.kernel.org/r/1495611866-27360-4-git-send-email-ghe@suse.com Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/filecheck.c | 301 +++++++++++++++++++-------------------------------- fs/ocfs2/filecheck.h | 20 ++-- fs/ocfs2/ocfs2.h | 8 ++ fs/ocfs2/super.c | 27 ++++- 4 files changed, 152 insertions(+), 204 deletions(-) diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 43477272ab96..a94c5310a59a 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -53,9 +53,6 @@ static const char * const ocfs2_filecheck_errs[] = { "UNSUPPORTED" }; -static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); -static LIST_HEAD(ocfs2_filecheck_sysfs_list); - struct ocfs2_filecheck_entry { struct list_head fe_list; unsigned long fe_ino; @@ -83,40 +80,84 @@ ocfs2_filecheck_error(int errno) return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; } -static ssize_t ocfs2_filecheck_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf); -static ssize_t ocfs2_filecheck_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count); -static struct kobj_attribute ocfs2_attr_filecheck_chk = +static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +static struct kobj_attribute ocfs2_filecheck_attr_chk = __ATTR(check, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); -static struct kobj_attribute ocfs2_attr_filecheck_fix = + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct kobj_attribute ocfs2_filecheck_attr_fix = __ATTR(fix, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); -static struct kobj_attribute ocfs2_attr_filecheck_set = + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct kobj_attribute ocfs2_filecheck_attr_set = __ATTR(set, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct attribute *ocfs2_filecheck_attrs[] = { + &ocfs2_filecheck_attr_chk.attr, + &ocfs2_filecheck_attr_fix.attr, + &ocfs2_filecheck_attr_set.attr, + NULL +}; -static int ocfs2_filecheck_sysfs_wait(atomic_t *p) +static void ocfs2_filecheck_release(struct kobject *kobj) { - schedule(); - return 0; + struct ocfs2_filecheck_sysfs_entry *entry = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); + + complete(&entry->fs_kobj_unregister); +} + +static ssize_t +ocfs2_filecheck_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + ssize_t ret = -EIO; + struct kobj_attribute *kattr = container_of(attr, + struct kobj_attribute, attr); + + kobject_get(kobj); + if (kattr->show) + ret = kattr->show(kobj, kattr, buf); + kobject_put(kobj); + return ret; +} + +static ssize_t +ocfs2_filecheck_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret = -EIO; + struct kobj_attribute *kattr = container_of(attr, + struct kobj_attribute, attr); + + kobject_get(kobj); + if (kattr->store) + ret = kattr->store(kobj, kattr, buf, count); + kobject_put(kobj); + return ret; } +static const struct sysfs_ops ocfs2_filecheck_ops = { + .show = ocfs2_filecheck_show, + .store = ocfs2_filecheck_store, +}; + +static struct kobj_type ocfs2_ktype_filecheck = { + .default_attrs = ocfs2_filecheck_attrs, + .sysfs_ops = &ocfs2_filecheck_ops, + .release = ocfs2_filecheck_release, +}; + static void ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) { struct ocfs2_filecheck_entry *p; - if (!atomic_dec_and_test(&entry->fs_count)) - wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait, - TASK_UNINTERRUPTIBLE); - spin_lock(&entry->fs_fcheck->fc_lock); while (!list_empty(&entry->fs_fcheck->fc_head)) { p = list_first_entry(&entry->fs_fcheck->fc_head, @@ -127,151 +168,48 @@ ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) } spin_unlock(&entry->fs_fcheck->fc_lock); - kset_unregister(entry->fs_fcheckkset); - kset_unregister(entry->fs_devicekset); kfree(entry->fs_fcheck); - kfree(entry); -} - -static void -ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry) -{ - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list); - spin_unlock(&ocfs2_filecheck_sysfs_lock); -} - -static int ocfs2_filecheck_sysfs_del(const char *devname) -{ - struct ocfs2_filecheck_sysfs_entry *p; - - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { - if (!strcmp(p->fs_sb->s_id, devname)) { - list_del(&p->fs_list); - spin_unlock(&ocfs2_filecheck_sysfs_lock); - ocfs2_filecheck_sysfs_free(p); - return 0; - } - } - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return 1; -} - -static void -ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry) -{ - if (atomic_dec_and_test(&entry->fs_count)) - wake_up_atomic_t(&entry->fs_count); + entry->fs_fcheck = NULL; } -static struct ocfs2_filecheck_sysfs_entry * -ocfs2_filecheck_sysfs_get(const char *devname) +int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb) { - struct ocfs2_filecheck_sysfs_entry *p = NULL; - - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { - if (!strcmp(p->fs_sb->s_id, devname)) { - atomic_inc(&p->fs_count); - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return p; - } - } - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return NULL; -} - -int ocfs2_filecheck_create_sysfs(struct super_block *sb) -{ - int ret = 0; - struct kset *device_kset = NULL; - struct kset *fcheck_kset = NULL; - struct ocfs2_filecheck *fcheck = NULL; - struct ocfs2_filecheck_sysfs_entry *entry = NULL; - struct attribute **attrs = NULL; - struct attribute_group attrgp; - - if (!ocfs2_kset) - return -ENOMEM; - - attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS); - if (!attrs) { - ret = -ENOMEM; - goto error; - } else { - attrs[0] = &ocfs2_attr_filecheck_chk.attr; - attrs[1] = &ocfs2_attr_filecheck_fix.attr; - attrs[2] = &ocfs2_attr_filecheck_set.attr; - attrs[3] = NULL; - memset(&attrgp, 0, sizeof(attrgp)); - attrgp.attrs = attrs; - } + int ret; + struct ocfs2_filecheck *fcheck; + struct ocfs2_filecheck_sysfs_entry *entry = &osb->osb_fc_ent; fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS); - if (!fcheck) { - ret = -ENOMEM; - goto error; - } else { - INIT_LIST_HEAD(&fcheck->fc_head); - spin_lock_init(&fcheck->fc_lock); - fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; - fcheck->fc_size = 0; - fcheck->fc_done = 0; - } - - if (strlen(sb->s_id) <= 0) { - mlog(ML_ERROR, - "Cannot get device basename when create filecheck sysfs\n"); - ret = -ENODEV; - goto error; - } - - device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj); - if (!device_kset) { - ret = -ENOMEM; - goto error; - } - - fcheck_kset = kset_create_and_add("filecheck", NULL, - &device_kset->kobj); - if (!fcheck_kset) { - ret = -ENOMEM; - goto error; - } - - ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp); - if (ret) - goto error; + if (!fcheck) + return -ENOMEM; - entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS); - if (!entry) { - ret = -ENOMEM; - goto error; - } else { - atomic_set(&entry->fs_count, 1); - entry->fs_sb = sb; - entry->fs_devicekset = device_kset; - entry->fs_fcheckkset = fcheck_kset; - entry->fs_fcheck = fcheck; - ocfs2_filecheck_sysfs_add(entry); + INIT_LIST_HEAD(&fcheck->fc_head); + spin_lock_init(&fcheck->fc_lock); + fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; + fcheck->fc_size = 0; + fcheck->fc_done = 0; + + entry->fs_kobj.kset = osb->osb_dev_kset; + init_completion(&entry->fs_kobj_unregister); + ret = kobject_init_and_add(&entry->fs_kobj, &ocfs2_ktype_filecheck, + NULL, "filecheck"); + if (ret) { + kfree(fcheck); + return ret; } - kfree(attrs); + entry->fs_fcheck = fcheck; return 0; - -error: - kfree(attrs); - kfree(entry); - kfree(fcheck); - kset_unregister(fcheck_kset); - kset_unregister(device_kset); - return ret; } -int ocfs2_filecheck_remove_sysfs(struct super_block *sb) +void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb) { - return ocfs2_filecheck_sysfs_del(sb->s_id); + if (!osb->osb_fc_ent.fs_fcheck) + return; + + kobject_del(&osb->osb_fc_ent.fs_kobj); + kobject_put(&osb->osb_fc_ent.fs_kobj); + wait_for_completion(&osb->osb_fc_ent.fs_kobj_unregister); + ocfs2_filecheck_sysfs_free(&osb->osb_fc_ent); } static int @@ -365,7 +303,7 @@ ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count, return 0; } -static ssize_t ocfs2_filecheck_show(struct kobject *kobj, +static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -373,19 +311,12 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj, ssize_t ret = 0, total = 0, remain = PAGE_SIZE; unsigned int type; struct ocfs2_filecheck_entry *p; - struct ocfs2_filecheck_sysfs_entry *ent; + struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); if (ocfs2_filecheck_type_parse(attr->attr.name, &type)) return -EINVAL; - ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); - if (!ent) { - mlog(ML_ERROR, - "Cannot get the corresponding entry via device basename %s\n", - kobj->name); - return -ENODEV; - } - if (type == OCFS2_FILECHECK_TYPE_SET) { spin_lock(&ent->fs_fcheck->fc_lock); total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max); @@ -419,11 +350,10 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj, spin_unlock(&ent->fs_fcheck->fc_lock); exit: - ocfs2_filecheck_sysfs_put(ent); return total; } -static int +static inline int ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent) { struct ocfs2_filecheck_entry *p; @@ -469,14 +399,14 @@ ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent, } static unsigned int -ocfs2_filecheck_handle(struct super_block *sb, +ocfs2_filecheck_handle(struct ocfs2_super *osb, unsigned long ino, unsigned int flags) { unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS; struct inode *inode = NULL; int rc; - inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0); + inode = ocfs2_iget(osb, ino, flags, 0); if (IS_ERR(inode)) { rc = (int)(-(long)inode); if (rc >= OCFS2_FILECHECK_ERR_START && @@ -494,11 +424,14 @@ static void ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, struct ocfs2_filecheck_entry *entry) { + struct ocfs2_super *osb = container_of(ent, struct ocfs2_super, + osb_fc_ent); + if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK) - entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_status = ocfs2_filecheck_handle(osb, entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK); else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX) - entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_status = ocfs2_filecheck_handle(osb, entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX); else entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED; @@ -506,30 +439,21 @@ ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, ocfs2_filecheck_done_entry(ent, entry); } -static ssize_t ocfs2_filecheck_store(struct kobject *kobj, +static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + ssize_t ret = 0; struct ocfs2_filecheck_args args; struct ocfs2_filecheck_entry *entry; - struct ocfs2_filecheck_sysfs_entry *ent; - ssize_t ret = 0; + struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); if (count == 0) return count; - if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) { - mlog(ML_ERROR, "Invalid arguments for online file check\n"); + if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) return -EINVAL; - } - - ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); - if (!ent) { - mlog(ML_ERROR, - "Cannot get the corresponding entry via device basename %s\n", - kobj->parent->name); - return -ENODEV; - } if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) { ret = ocfs2_filecheck_adjust_max(ent, args.fa_len); @@ -544,7 +468,7 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj, spin_lock(&ent->fs_fcheck->fc_lock); if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && - (ent->fs_fcheck->fc_done == 0)) { + (ent->fs_fcheck->fc_done == 0)) { mlog(ML_NOTICE, "Cannot do more file check " "since file check queue(%u) is full now\n", @@ -574,6 +498,5 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj, ocfs2_filecheck_handle_entry(ent, entry); exit: - ocfs2_filecheck_sysfs_put(ent); return (!ret ? count : ret); } diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h index af1678b620a4..6a22ee79e8d0 100644 --- a/fs/ocfs2/filecheck.h +++ b/fs/ocfs2/filecheck.h @@ -51,15 +51,6 @@ struct ocfs2_filecheck { unsigned int fc_done; /* Finished entry count in list */ }; -struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */ - struct list_head fs_list; - atomic_t fs_count; - struct super_block *fs_sb; - struct kset *fs_devicekset; - struct kset *fs_fcheckkset; - struct ocfs2_filecheck *fs_fcheck; -}; - #define OCFS2_FILECHECK_MAXSIZE 100 #define OCFS2_FILECHECK_MINSIZE 10 @@ -70,7 +61,14 @@ enum { OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ }; -int ocfs2_filecheck_create_sysfs(struct super_block *sb); -int ocfs2_filecheck_remove_sysfs(struct super_block *sb); +struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per partition */ + struct kobject fs_kobj; + struct completion fs_kobj_unregister; + struct ocfs2_filecheck *fs_fcheck; +}; + + +int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb); +void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb); #endif /* FILECHECK_H */ diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 0c39d71c67a1..3bb169157a01 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -50,6 +50,8 @@ #include "reservations.h" +#include "filecheck.h" + /* Caching of metadata buffers */ /* Most user visible OCFS2 inodes will have very few pieces of @@ -473,6 +475,12 @@ struct ocfs2_super * workqueue and schedule on our own. */ struct workqueue_struct *ocfs2_wq; + + /* sysfs directory per partition */ + struct kset *osb_dev_kset; + + /* file check related stuff */ + struct ocfs2_filecheck_sysfs_entry osb_fc_ent; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f03fdd0fc72c..3aa927c681d2 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1162,6 +1162,23 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) ocfs2_complete_mount_recovery(osb); + osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, + &ocfs2_kset->kobj); + if (!osb->osb_dev_kset) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id); + goto read_super_error; + } + + /* Create filecheck sysfs related directories/files at + * /sys/fs/ocfs2//filecheck */ + if (ocfs2_filecheck_create_sysfs(osb)) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create filecheck sysfs directory at " + "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id); + goto read_super_error; + } + if (ocfs2_mount_local(osb)) snprintf(nodestr, sizeof(nodestr), "local"); else @@ -1200,9 +1217,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) /* Start this when the mount is almost sure of being successful */ ocfs2_orphan_scan_start(osb); - /* Create filecheck sysfile /sys/fs/ocfs2//filecheck */ - ocfs2_filecheck_create_sysfs(sb); - return status; read_super_error: @@ -1654,7 +1668,6 @@ static void ocfs2_put_super(struct super_block *sb) ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); - ocfs2_filecheck_remove_sysfs(sb); } static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -1900,6 +1913,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); + /* Remove file check sysfs related directores/files, + * and wait for the pending file check operations */ + ocfs2_filecheck_remove_sysfs(osb); + + kset_unregister(osb->osb_dev_kset); + debugfs_remove(osb->osb_ctxt); /* Orphan scan should be stopped as early as possible */ -- cgit v1.2.3 From e2d18e1b55f66e82de016205fb7f22ef59a7a1bf Mon Sep 17 00:00:00 2001 From: Gang He Date: Sat, 15 Jul 2017 11:41:16 +1000 Subject: ocfs2: add duplicated ino number check Add duplicated ino number check, to avoid adding a file into the file check list when this file is being checked. Link: http://lkml.kernel.org/r/1495611866-27360-5-git-send-email-ghe@suse.com Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/filecheck.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index a94c5310a59a..f65f2b2f594d 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -353,6 +353,22 @@ exit: return total; } +static inline int +ocfs2_filecheck_is_dup_entry(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned long ino) +{ + struct ocfs2_filecheck_entry *p; + + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (!p->fe_done) { + if (p->fe_ino == ino) + return 1; + } + } + + return 0; +} + static inline int ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent) { @@ -467,7 +483,10 @@ static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, } spin_lock(&ent->fs_fcheck->fc_lock); - if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + if (ocfs2_filecheck_is_dup_entry(ent, args.fa_ino)) { + ret = -EEXIST; + kfree(entry); + } else if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && (ent->fs_fcheck->fc_done == 0)) { mlog(ML_NOTICE, "Cannot do more file check " -- cgit v1.2.3 From e81d9956cd86e4df47ec222b3b8001b14f827311 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Sat, 15 Jul 2017 11:41:17 +1000 Subject: block: restore /proc/partitions to not display non-partitionable removable devices We found with newer kernels we started seeing the cdrom device showing up in /proc/partitions, but it was not there before. Looking into this I found that commit d27769ec ("block: add GENHD_FL_NO_PART_SCAN") introduces this change in behavior. It's not clear to me from the commit's changelog if this change was intentional or not. This comment still remains: /* Don't show non-partitionable removeable devices or empty devices */ so I've decided to send a patch to restore the behavior of not printing unpartitionable removable devices. Signed-off-by: Josh Hunt Cc: Tejun Heo Cc: Kay Sievers Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 7f520fa25d16..9dc591678627 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -890,7 +890,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) -- cgit v1.2.3 From 1c68a50d0904a1b2cf399a866b762dca298da5cf Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Sat, 15 Jul 2017 11:41:17 +1000 Subject: slub: tidy up initialization ordering - free_kmem_cache_nodes() frees the cache node before nulling out a reference to it - init_kmem_cache_nodes() publishes the cache node before initializing it Neither of these matter at runtime because the cache nodes cannot be looked up by any other thread. But it's neater and more consistent to reorder these. Link: http://lkml.kernel.org/r/20170707083408.40410-1-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 1d3f9835f4ea..5ac0b5ad029b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3358,8 +3358,8 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) struct kmem_cache_node *n; for_each_kmem_cache_node(s, node, n) { - kmem_cache_free(kmem_cache_node, n); s->node[node] = NULL; + kmem_cache_free(kmem_cache_node, n); } } @@ -3389,8 +3389,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) return 0; } - s->node[node] = n; init_kmem_cache_node(n); + s->node[node] = n; } return 1; } -- cgit v1.2.3 From 6379d071d06ef44e0e07c054680a8cdd87cb797e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 15 Jul 2017 11:41:17 +1000 Subject: mm/memory_hotplug: just build zonelist for newly added node 9adb62a5df9c0fbef7 ("mm/hotplug: correctly setup fallback zonelists when creating new pgdat") tries to build the correct zonelist for a newly added node, while it is not necessary to rebuild it for already exist nodes. In build_zonelists(), it will iterate on nodes with memory. For a newly added node, it will have memory until node_states_set_node() is called in online_pages(). This patch avoids rebuilding the zonelists for already existing nodes. build_zonelists_node() uses managed_zone(zone) checks, so it should not include empty zones anyway. So effectively we avoid some pointless work under stop_machine(). Link: http://lkml.kernel.org/r/20170626035822.50155-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Jiang Liu Cc: Xishi Qiu Signed-off-by: Andrew Morton --- mm/page_alloc.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d30e914afb6..7392e4ff628f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5213,15 +5213,17 @@ static int __build_all_zonelists(void *data) memset(node_load, 0, sizeof(node_load)); #endif - if (self && !node_online(self->node_id)) { + /* This node is hotadded and no memory preset yet. + * So just build zonelists is fine, no need to touch other nodes. + */ + if (self && !node_online(self->node_id)) build_zonelists(self); - } - - for_each_online_node(nid) { - pg_data_t *pgdat = NODE_DATA(nid); + else + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); - build_zonelists(pgdat); - } + build_zonelists(pgdat); + } /* * Initialize the boot_pagesets that are going to be used -- cgit v1.2.3 From 5c865b2316e6ec459c5170a1b9e7fa77b79b8b9b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 15 Jul 2017 11:41:17 +1000 Subject: mm-memory_hotplug-just-build-zonelist-for-new-added-node-fix tweak comment text Cc: Jiang Liu Cc: Michal Hocko Cc: Vlastimil Babka Cc: Wei Yang Cc: Xishi Qiu Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7392e4ff628f..bb2d08167bbf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5213,8 +5213,9 @@ static int __build_all_zonelists(void *data) memset(node_load, 0, sizeof(node_load)); #endif - /* This node is hotadded and no memory preset yet. - * So just build zonelists is fine, no need to touch other nodes. + /* + * This node is hotadded and no memory is yet present. So just + * building zonelists is fine - no need to touch other nodes. */ if (self && !node_online(self->node_id)) build_zonelists(self); -- cgit v1.2.3 From 5ee2dcffffc03fcc7056b7d5defc4f97ced4bc9a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 15 Jul 2017 11:41:17 +1000 Subject: mm-memory_hotplug-just-build-zonelist-for-new-added-node-fix-fix coding-style tweak, per Vlastimil Cc: Jiang Liu Cc: Michal Hocko Cc: Vlastimil Babka Cc: Wei Yang Cc: Xishi Qiu Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bb2d08167bbf..f75c18307beb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5217,14 +5217,15 @@ static int __build_all_zonelists(void *data) * This node is hotadded and no memory is yet present. So just * building zonelists is fine - no need to touch other nodes. */ - if (self && !node_online(self->node_id)) + if (self && !node_online(self->node_id)) { build_zonelists(self); - else + } else { for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); } + } /* * Initialize the boot_pagesets that are going to be used -- cgit v1.2.3 From 3301171f809f27b7ad49a6f35cae4b501e8803d0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 15 Jul 2017 11:41:17 +1000 Subject: mm/page_alloc: return 0 in case this node has no page within the zone The whole memory space is divided into several zones and nodes may have no page in some zones. In this case, the __absent_pages_in_range() would return 0, since the range it is searching for is an empty range. Also this happens more often to those nodes with higher memory range when there are more nodes, which is a trend for future architectures. This patch checks the zone range after clamp and adjustment, return 0 if the range is an empty range. Link: http://lkml.kernel.org/r/20170206154314.15705-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f75c18307beb..80e4adb4c360 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5866,6 +5866,11 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, &zone_start_pfn, &zone_end_pfn); + + /* If this node has no page within this zone, return 0. */ + if (zone_start_pfn == zone_end_pfn) + return 0; + nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); /* -- cgit v1.2.3 From 187e364b7dfa13b18a13f250707f32ae13f40891 Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Sat, 15 Jul 2017 11:41:17 +1000 Subject: mm: vmscan: do not pass reclaimed slab to vmpressure During global reclaim, the nr_reclaimed passed to vmpressure includes the pages reclaimed from slab. But the corresponding scanned slab pages is not passed. There is an impact to the vmpressure values because of this. While moving from kernel version 3.18 to 4.4, a difference is seen in the vmpressure values for the same workload resulting in a different behaviour of the vmpressure consumer. One such case is of a vmpressure based lowmemorykiller. It is observed that the vmpressure events are received late and less in number resulting in tasks not being killed at the right time. In this use case, The number of critical vmpressure events received is around 50% less on 4.4 than 3.18. The following numbers show the impact on reclaim activity due to the change in behaviour of lowmemorykiller on a 4GB device. The test launches a number of apps in sequence and repeats it multiple times. The difference in reclaim behaviour is because of lesser number of kills and kills happening late, resulting in more swapping and page cache reclaim. v4.4 v3.18 pgpgin 163016456 145617236 pgpgout 4366220 4188004 workingset_refault 29857868 26781854 workingset_activate 6293946 5634625 pswpin 1327601 1133912 pswpout 3593842 3229602 pgalloc_dma 99520618 94402970 pgalloc_normal 104046854 98124798 pgfree 203772640 192600737 pgmajfault 2126962 1851836 pgsteal_kswapd_dma 19732899 18039462 pgsteal_kswapd_normal 19945336 17977706 pgsteal_direct_dma 206757 131376 pgsteal_direct_normal 236783 138247 pageoutrun 116622 108370 allocstall 7220 4684 compact_stall 931 856 The lowmemorykiller example above is just for indicating the difference in vmpressure events between 4.4 and 3.18. Do not consider reclaimed slab pages for vmpressure calculation. The reclaimed pages from slab can be excluded because the freeing of a page by slab shrinking depends on each slab's object population, making the cost model (i.e. scan:free) different from that of LRU. Also, not every shrinker accounts the pages it reclaims. Ideally the pages reclaimed from slab should be passed to vmpressure, otherwise higher vmpressure levels can be triggered even when there is a reclaim progress. But accounting only the reclaimed slab pages without the scanned, and adding something which does not fit into the cost model just adds noise to the vmpressure values. Fixes: 6b4f7799c6a5 ("mm: vmscan: invoke slab shrinkers from shrink_zone()") Link: http://lkml.kernel.org/r/1486641577-11685-2-git-send-email-vinmenon@codeaurora.org Signed-off-by: Vinayak Menon Acked-by: Minchan Kim Cc: Johannes Weiner Cc: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Cc: Rik van Riel Cc: Vladimir Davydov Cc: Anton Vorontsov Cc: Shiraz Hashim Signed-off-by: Andrew Morton --- mm/vmscan.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index a1af041930a6..efc9da21c5e6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2632,16 +2632,23 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->nr_scanned - nr_scanned, node_lru_pages); + /* + * Record the subtree's reclaim efficiency. The reclaimed + * pages from slab is excluded here because the corresponding + * scanned pages is not accounted. Moreover, freeing a page + * by slab shrinking depends on each slab's object population, + * making the cost model (i.e. scan:free) different from that + * of LRU. + */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } - /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); - if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; -- cgit v1.2.3 From dc52c5c660fd03d25f9f859d60e3a7685e437fa1 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Sat, 15 Jul 2017 11:41:18 +1000 Subject: mm/page_owner: align with pageblock_nr pages When pfn_valid(pfn) returns false, pfn should be aligned with pageblock_nr_pages other than MAX_ORDER_NR_PAGES in init_pages_in_zone, because the skipped 2M may be valid pfn, as a result, early allocated count will not be accurate. Link: http://lkml.kernel.org/r/1468938136-24228-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 0fd9dcf2c5dc..356828237581 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -531,7 +531,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } -- cgit v1.2.3 From dcb37f7e16fec90e4e92608c7053324f75b2a501 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Sat, 15 Jul 2017 11:41:18 +1000 Subject: mm/vmstat.c: walk the zone in pageblock_nr_pages steps when walking the zone, we can happens to the holes. we should not align MAX_ORDER_NR_PAGES, so it can skip the normal memory. In addition, pagetypeinfo_showmixedcount_print reflect fragmentization. we hope to get more accurate data. therefore, I decide to fix it. Link: http://lkml.kernel.org/r/1469502526-24486-2-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 356828237581..401feb070335 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -261,7 +261,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } -- cgit v1.2.3 From 6b0f6a1f8c6eeca5d6d6e35751bfd40f63195b38 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 15 Jul 2017 11:41:18 +1000 Subject: fs/seq_file.c: delete small-value optimization num_to_str() optimizes printing small integers [0..9], so the same check higher in callchain is unnecessary. Link: http://lkml.kernel.org/r/20170516204246.GA18123@avx2 Signed-off-by: Alexey Dobriyan Cc: Joe Perches Signed-off-by: Andrew Morton --- fs/seq_file.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/seq_file.c b/fs/seq_file.c index dc7c2be963ed..13e8c092d4d2 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -694,11 +694,6 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, if (m->count + 1 >= m->size) goto overflow; - if (num < 10) { - m->buf[m->count++] = num + '0'; - return; - } - len = num_to_str(m->buf + m->count, m->size - m->count, num); if (!len) goto overflow; @@ -733,11 +728,6 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num num = -num; } - if (num < 10) { - m->buf[m->count++] = num + '0'; - return; - } - len = num_to_str(m->buf + m->count, m->size - m->count, num); if (!len) goto overflow; -- cgit v1.2.3 From 3beb8be37f7e1347b18d4c43997b18a09fcc6c85 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Sat, 15 Jul 2017 11:41:18 +1000 Subject: kdump, vmcoreinfo: report actual value of phys_base Currently, VMCOREINFO note information reports the virtual address of phys_base that is assigned to symbol phys_base. But this doesn't make sense because to refer to phys_base, it's necessary to get the value of phys_base itself we are now about to refer to. Userland tools related to kdump such as makedumpfile and crash utility so far have made some efforts to calculate phys_base on crash dump formats generated by mechanisms running outside Linux kernel, such as virtual machine hypervisor such as qemu dump, which ordinary users use via virsh dump, or ones implemented on vendor specific firmware. That is, find a kernel data whose virtual and physical addresses are available via its note information and calculate phys_base from it. However, such data structure is not the one prepared for phys_base purpose. There's no guarantee that other crash dump mechanisms include such information that can be used to calculate phys_base similarly. To get VMCOREINFO in vmcore, it's easy to use strings and grep commands like this; VMCOREINFO consists of simple string: $ strings vmcore-3.10.0-121.el7.x86_64 | grep -E ".*VMCOREINFO.*" -A 100 VMCOREINFO OSRELEASE=3.10.0-121.el7.x86_64 PAGESIZE=4096 ... This is also useful to get value of phys_base in kdump 2nd kernel contained in vmcore using the above-mentioned external crash dump mechanism; kdump 2nd kernel is an inherently relocated kernel. This commit doesn't remove VMCOREINFO_SYMBOL(phys_base) line because makedumpfile refers to it and if removing it, old versions makedumpfile doesn't work well. Signed-off-by: HATAYAMA Daisuke Cc: Eric W. Biederman Cc: Vivek Goyal Cc: Atsushi Kumagai Cc: Dave Anderson Signed-off-by: Andrew Morton --- arch/x86/kernel/machine_kexec_64.c | 1 + include/linux/crash_core.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index cb0a30473c23..df2ac62298c0 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -356,6 +356,7 @@ void arch_crash_save_vmcoreinfo(void) vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); + VMCOREINFO_PHYS_BASE(phys_base); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 2df2118fbe13..fffe674cad27 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -56,6 +56,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +#define VMCOREINFO_PHYS_BASE(value) \ + vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value) extern u32 *vmcoreinfo_note; -- cgit v1.2.3 From 46f7f69c3e1a838c07725b7aa1ef547c5ad232b0 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Sat, 15 Jul 2017 11:41:19 +1000 Subject: uapi: fix linux/sysctl.h userspace compilation errors Include (guarded by #ifndef __KERNEL__) to fix the following linux/sysctl.h userspace compilation errors: /usr/include/linux/sysctl.h:38:2: error: unknown type name 'size_t' size_t *oldlenp; /usr/include/linux/sysctl.h:40:2: error: unknown type name 'size_t' size_t newlen; This also fixes userspace compilation of uapi headers that include linux/sysctl.h, e.g. linux/netfilter.h. Link: http://lkml.kernel.org/r/20170222230652.GA14373@altlinux.org Signed-off-by: Dmitry V. Levin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- include/uapi/linux/sysctl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index e13d48058b8d..c6602bdd2a10 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -26,6 +26,10 @@ #include #include +#ifndef __KERNEL__ +#include /* For size_t. */ +#endif + #define CTL_MAXNAME 10 /* how many path components do we allow in a call to sysctl? In other words, what is the largest acceptable value for the nlen -- cgit v1.2.3 From 86119af309c6b8e74b6cb22d7bf388cff3ca072b Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Sat, 15 Jul 2017 11:41:19 +1000 Subject: kernel/reboot.c: add devm_register_reboot_notifier() Add devm_* wrapper around register_reboot_notifier to simplify device specific reboot notifier registration/unregistration. Link: http://lkml.kernel.org/r/20170320171753.1705-1-andrew.smirnov@gmail.com Signed-off-by: Andrey Smirnov Signed-off-by: Andrew Morton --- include/linux/reboot.h | 3 +++ kernel/reboot.c | 27 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index a7ff409f386d..0ca25413ad2d 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -38,6 +38,9 @@ extern int reboot_force; extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); +struct device; +extern int devm_register_reboot_notifier(struct device *, struct notifier_block *); + extern int register_restart_handler(struct notifier_block *); extern int unregister_restart_handler(struct notifier_block *); extern void do_kernel_restart(char *cmd); diff --git a/kernel/reboot.c b/kernel/reboot.c index bd30a973fe94..e4ced883d8de 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +static void devm_unregister_reboot_notifier(struct device *dev, void *res) +{ + WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res)); +} + +int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb) +{ + struct notifier_block **rcnb; + int ret; + + rcnb = devres_alloc(devm_unregister_reboot_notifier, + sizeof(*rcnb), GFP_KERNEL); + if (!rcnb) + return -ENOMEM; + + ret = register_reboot_notifier(nb); + if (!ret) { + *rcnb = nb; + devres_add(dev, rcnb); + } else { + devres_free(rcnb); + } + + return ret; +} +EXPORT_SYMBOL(devm_register_reboot_notifier); + /* * Notifier list for kernel code which wants to be called * to restart the system. -- cgit v1.2.3 From 7f004d4884d4b78c6ac68cc6cd6a105f21ed047b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 15 Jul 2017 11:41:19 +1000 Subject: kernel-reboot-add-devm_register_reboot_notifier-fix move `struct device' forward declaration to top-of-file Cc: Andrey Smirnov Signed-off-by: Andrew Morton --- include/linux/reboot.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 0ca25413ad2d..ecbf7b56b9db 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -5,6 +5,8 @@ #include #include +struct device; + #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN #define SYS_HALT 0x0002 /* Notify of system halt */ @@ -38,7 +40,6 @@ extern int reboot_force; extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); -struct device; extern int devm_register_reboot_notifier(struct device *, struct notifier_block *); extern int register_restart_handler(struct notifier_block *); -- cgit v1.2.3