From ca7cc5110a313a609da40ae948978a585352564b Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:13 +0800 Subject: ACPI, APEI, ERST, Prevent erst_dbg from loading if ERST is disabled erst_dbg module can not work when ERST is disabled. So disable module loading to provide clearer information to user. Signed-off-by: Huang Ying Signed-off-by: Len Brown --- drivers/acpi/apei/erst-dbg.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c index a4cfb64c86a..cb04aa43c33 100644 --- a/drivers/acpi/apei/erst-dbg.c +++ b/drivers/acpi/apei/erst-dbg.c @@ -213,6 +213,10 @@ static struct miscdevice erst_dbg_dev = { static __init int erst_dbg_init(void) { + if (erst_disable) { + pr_info(ERST_DBG_PFX "ERST support is disabled.\n"); + return -ENODEV; + } return misc_register(&erst_dbg_dev); } -- cgit v1.2.3 From d37afc50e618271839f001ea653949eefc728167 Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Wed, 13 Jul 2011 13:14:14 +0800 Subject: ACPI, APEI, ERST, Fix erst-dbg long record reading issue When we debug ERST table with erst-dbg, if the error record in ERST table is too long(>4K), it can't be read out. So this patch increases the buffer size to 16K to ensure such error records can be read from ERST table. Signed-off-by: Chen Gong Signed-off-by: Huang Ying Signed-off-by: Len Brown --- drivers/acpi/apei/erst-dbg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c index cb04aa43c33..903549df809 100644 --- a/drivers/acpi/apei/erst-dbg.c +++ b/drivers/acpi/apei/erst-dbg.c @@ -33,7 +33,7 @@ #define ERST_DBG_PFX "ERST DBG: " -#define ERST_DBG_RECORD_LEN_MAX 4096 +#define ERST_DBG_RECORD_LEN_MAX 0x4000 static void *erst_dbg_buf; static unsigned int erst_dbg_buf_len; -- cgit v1.2.3 From 5588340d46a484da53bbce8136184d9c7fbc259c Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:15 +0800 Subject: ACPI, APEI, GHES, Do not ratelimit fatal error printk before panic printk is used by GHES to report hardware errors. Normally, the printk will be ratelimited to avoid too many hardware error reports in kernel log. Because there may be thousands or even millions of corrected hardware errors during system running. That is different for fatal hardware error, because system will go panic as soon as possible, there will be no more than several error records. And these error records are valuable for system fault diagnosis, so they should not be ratelimited. Signed-off-by: Huang Ying Signed-off-by: Len Brown --- drivers/acpi/apei/ghes.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index f703b288115..f339c0f8369 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -360,11 +360,8 @@ static void ghes_do_proc(struct ghes *ghes) } } -static void ghes_print_estatus(const char *pfx, struct ghes *ghes) +static void __ghes_print_estatus(const char *pfx, struct ghes *ghes) { - /* Not more than 2 messages every 5 seconds */ - static DEFINE_RATELIMIT_STATE(ratelimit, 5*HZ, 2); - if (pfx == NULL) { if (ghes_severity(ghes->estatus->error_severity) <= GHES_SEV_CORRECTED) @@ -372,12 +369,18 @@ static void ghes_print_estatus(const char *pfx, struct ghes *ghes) else pfx = KERN_ERR HW_ERR; } - if (__ratelimit(&ratelimit)) { - printk( - "%s""Hardware error from APEI Generic Hardware Error Source: %d\n", - pfx, ghes->generic->header.source_id); - apei_estatus_print(pfx, ghes->estatus); - } + printk("%s""Hardware error from APEI Generic Hardware Error Source: %d\n", + pfx, ghes->generic->header.source_id); + apei_estatus_print(pfx, ghes->estatus); +} + +static void ghes_print_estatus(const char *pfx, struct ghes *ghes) +{ + /* Not more than 2 messages every 5 seconds */ + static DEFINE_RATELIMIT_STATE(ratelimit, 5*HZ, 2); + + if (__ratelimit(&ratelimit)) + __ghes_print_estatus(pfx, ghes); } static int ghes_proc(struct ghes *ghes) @@ -476,7 +479,7 @@ static int ghes_notify_nmi(struct notifier_block *this, if (sev_global >= GHES_SEV_PANIC) { oops_begin(); - ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global); + __ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global); /* reboot to log the error! */ if (panic_timeout == 0) panic_timeout = ghes_panic_timeout; -- cgit v1.2.3 From eecf2f7124834dd1cad21807526a8ea031ba8217 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:16 +0800 Subject: ACPI, APEI, Add apei_exec_run_optional Some actions in APEI ERST and EINJ tables are optional, for example, ACPI_EINJ_BEGIN_OPERATION action is used to do some preparation for error injection, and firmware may choose to do nothing here. While some other actions are mandatory, for example, firmware must provide ACPI_EINJ_GET_ERROR_TYPE implementation. Original implementation treats all actions as optional (that is, can have no instructions), that may cause issue if firmware does not provide some mandatory actions. To fix this, this patch adds apei_exec_run_optional, which should be used for optional actions. The original apei_exec_run should be used for mandatory actions. Cc: Thomas Renninger Signed-off-by: Huang Ying Signed-off-by: Len Brown --- drivers/acpi/apei/apei-base.c | 9 +++++---- drivers/acpi/apei/apei-internal.h | 13 ++++++++++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/apei/apei-base.c b/drivers/acpi/apei/apei-base.c index 4a904a4bf05..0714194229d 100644 --- a/drivers/acpi/apei/apei-base.c +++ b/drivers/acpi/apei/apei-base.c @@ -157,9 +157,10 @@ EXPORT_SYMBOL_GPL(apei_exec_noop); * Interpret the specified action. Go through whole action table, * execute all instructions belong to the action. */ -int apei_exec_run(struct apei_exec_context *ctx, u8 action) +int __apei_exec_run(struct apei_exec_context *ctx, u8 action, + bool optional) { - int rc; + int rc = -ENOENT; u32 i, ip; struct acpi_whea_header *entry; apei_exec_ins_func_t run; @@ -198,9 +199,9 @@ rewind: goto rewind; } - return 0; + return !optional && rc < 0 ? rc : 0; } -EXPORT_SYMBOL_GPL(apei_exec_run); +EXPORT_SYMBOL_GPL(__apei_exec_run); typedef int (*apei_exec_entry_func_t)(struct apei_exec_context *ctx, struct acpi_whea_header *entry, diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h index ef0581f2094..f286cf753f3 100644 --- a/drivers/acpi/apei/apei-internal.h +++ b/drivers/acpi/apei/apei-internal.h @@ -50,7 +50,18 @@ static inline u64 apei_exec_ctx_get_output(struct apei_exec_context *ctx) return ctx->value; } -int apei_exec_run(struct apei_exec_context *ctx, u8 action); +int __apei_exec_run(struct apei_exec_context *ctx, u8 action, bool optional); + +static inline int apei_exec_run(struct apei_exec_context *ctx, u8 action) +{ + return __apei_exec_run(ctx, action, 0); +} + +/* It is optional whether the firmware provides the action */ +static inline int apei_exec_run_optional(struct apei_exec_context *ctx, u8 action) +{ + return __apei_exec_run(ctx, action, 1); +} /* Common instruction implementation */ -- cgit v1.2.3 From 392913de7cc7446531922f29c0a4382d8d09626c Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:17 +0800 Subject: ACPI, APEI, Use apei_exec_run_optional in APEI EINJ and ERST This patch changes APEI EINJ and ERST to use apei_exec_run for mandatory actions, and apei_exec_run_optional for optional actions. Cc: Thomas Renninger Signed-off-by: Huang Ying Signed-off-by: Len Brown --- drivers/acpi/apei/einj.c | 4 ++-- drivers/acpi/apei/erst.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c index f74b2ea11f2..ab821f1cfa1 100644 --- a/drivers/acpi/apei/einj.c +++ b/drivers/acpi/apei/einj.c @@ -285,7 +285,7 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2) einj_exec_ctx_init(&ctx); - rc = apei_exec_run(&ctx, ACPI_EINJ_BEGIN_OPERATION); + rc = apei_exec_run_optional(&ctx, ACPI_EINJ_BEGIN_OPERATION); if (rc) return rc; apei_exec_ctx_set_input(&ctx, type); @@ -323,7 +323,7 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2) rc = __einj_error_trigger(trigger_paddr); if (rc) return rc; - rc = apei_exec_run(&ctx, ACPI_EINJ_END_OPERATION); + rc = apei_exec_run_optional(&ctx, ACPI_EINJ_END_OPERATION); return rc; } diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index e6cef8e1b53..c8ba9da14e4 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -642,7 +642,7 @@ static int __erst_write_to_storage(u64 offset) int rc; erst_exec_ctx_init(&ctx); - rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_WRITE); + rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_WRITE); if (rc) return rc; apei_exec_ctx_set_input(&ctx, offset); @@ -666,7 +666,7 @@ static int __erst_write_to_storage(u64 offset) if (rc) return rc; val = apei_exec_ctx_get_output(&ctx); - rc = apei_exec_run(&ctx, ACPI_ERST_END); + rc = apei_exec_run_optional(&ctx, ACPI_ERST_END); if (rc) return rc; @@ -681,7 +681,7 @@ static int __erst_read_from_storage(u64 record_id, u64 offset) int rc; erst_exec_ctx_init(&ctx); - rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_READ); + rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_READ); if (rc) return rc; apei_exec_ctx_set_input(&ctx, offset); @@ -709,7 +709,7 @@ static int __erst_read_from_storage(u64 record_id, u64 offset) if (rc) return rc; val = apei_exec_ctx_get_output(&ctx); - rc = apei_exec_run(&ctx, ACPI_ERST_END); + rc = apei_exec_run_optional(&ctx, ACPI_ERST_END); if (rc) return rc; @@ -724,7 +724,7 @@ static int __erst_clear_from_storage(u64 record_id) int rc; erst_exec_ctx_init(&ctx); - rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_CLEAR); + rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_CLEAR); if (rc) return rc; apei_exec_ctx_set_input(&ctx, record_id); @@ -748,7 +748,7 @@ static int __erst_clear_from_storage(u64 record_id) if (rc) return rc; val = apei_exec_ctx_get_output(&ctx); - rc = apei_exec_run(&ctx, ACPI_ERST_END); + rc = apei_exec_run_optional(&ctx, ACPI_ERST_END); if (rc) return rc; -- cgit v1.2.3 From 86cd47334b00b6aa9b5d0ebf389a6fe76f21c641 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:18 +0800 Subject: ACPI, APEI, GHES, Prevent GHES to be built as module GHES (Generic Hardware Error Source) is used to process hardware error notification in firmware first mode. But because firmware first mode can be turned on but can not be turned off, it is unreasonable to unload the GHES module with firmware first mode turned on. To avoid confusion, this patch makes GHES can be enabled/disabled in configuration time, but not built as module and unloaded at run time. Signed-off-by: Huang Ying Reviewed-by: Andi Kleen Reviewed-by: Matthew Garrett Signed-off-by: Len Brown --- drivers/acpi/apei/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig index f739a70b1c7..3f45dde17ae 100644 --- a/drivers/acpi/apei/Kconfig +++ b/drivers/acpi/apei/Kconfig @@ -10,7 +10,7 @@ config ACPI_APEI error injection. config ACPI_APEI_GHES - tristate "APEI Generic Hardware Error Source" + bool "APEI Generic Hardware Error Source" depends on ACPI_APEI && X86 select ACPI_HED help -- cgit v1.2.3 From b6a9501658530d8b8374e37f1edb549039a8a260 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:19 +0800 Subject: ACPI, APEI, GHES, Support disable GHES at boot time Some machine may have broken firmware so that GHES and firmware first mode should be disabled. This patch adds support to that. Signed-off-by: Huang Ying Reviewed-by: Andi Kleen Reviewed-by: Matthew Garrett Signed-off-by: Len Brown --- drivers/acpi/apei/ghes.c | 8 ++++++++ drivers/acpi/apei/hest.c | 17 +++++++++-------- include/acpi/apei.h | 1 + 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index f339c0f8369..b142b94bf8b 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -77,6 +77,9 @@ struct ghes { }; }; +int ghes_disable; +module_param_named(disable, ghes_disable, bool, 0); + static int ghes_panic_timeout __read_mostly = 30; /* @@ -665,6 +668,11 @@ static int __init ghes_init(void) return -EINVAL; } + if (ghes_disable) { + pr_info(GHES_PFX "GHES is not enabled!\n"); + return -EINVAL; + } + rc = ghes_ioremap_init(); if (rc) goto err; diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c index 181bc2f7bb7..05fee06f4d6 100644 --- a/drivers/acpi/apei/hest.c +++ b/drivers/acpi/apei/hest.c @@ -231,16 +231,17 @@ void __init acpi_hest_init(void) goto err; } - rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count); - if (rc) - goto err; - - rc = hest_ghes_dev_register(ghes_count); - if (!rc) { - pr_info(HEST_PFX "Table parsing has been initialized.\n"); - return; + if (!ghes_disable) { + rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count); + if (rc) + goto err; + rc = hest_ghes_dev_register(ghes_count); + if (rc) + goto err; } + pr_info(HEST_PFX "Table parsing has been initialized.\n"); + return; err: hest_disable = 1; } diff --git a/include/acpi/apei.h b/include/acpi/apei.h index e67b523a50e..d40bc5521fc 100644 --- a/include/acpi/apei.h +++ b/include/acpi/apei.h @@ -18,6 +18,7 @@ extern int hest_disable; extern int erst_disable; +extern int ghes_disable; #ifdef CONFIG_ACPI_APEI void __init acpi_hest_init(void); -- cgit v1.2.3 From eccddd32ced0df8f9130024157bf8d37df860d76 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:20 +0800 Subject: ACPI, APEI, Add APEI bit support in generic _OSC call In APEI firmware first mode, hardware error is reported by hardware to firmware firstly, then firmware reports the error to Linux in a GHES error record via POLL/SCI/IRQ/NMI etc. This may result in some issues if OS has no full APEI support. So some firmware implementation will work in a back-compatible mode by default. Where firmware will only notify OS in old-fashion, without GHES record. For example, for a fatal hardware error, only NMI is signaled, no GHES record. To gain full APEI power on these machines, APEI bit in generic _OSC call can be specified to tell firmware that Linux has full APEI support. This patch adds the APEI bit support in generic _OSC call. Signed-off-by: Huang Ying Reviewed-by: Andi Kleen Reviewed-by: Matthew Garrett Signed-off-by: Len Brown --- drivers/acpi/bus.c | 16 ++++++++++++++-- include/linux/acpi.h | 2 ++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index d1e06c182cd..43ce3b0c492 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -519,6 +520,7 @@ out_kfree: } EXPORT_SYMBOL(acpi_run_osc); +bool osc_sb_apei_support_acked; static u8 sb_uuid_str[] = "0811B06E-4A27-44F9-8D60-3CBBC22E7B48"; static void acpi_bus_osc_support(void) { @@ -541,11 +543,21 @@ static void acpi_bus_osc_support(void) #if defined(CONFIG_ACPI_PROCESSOR) || defined(CONFIG_ACPI_PROCESSOR_MODULE) capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_PPC_OST_SUPPORT; #endif + +#ifdef CONFIG_ACPI_APEI_GHES + if (!ghes_disable) + capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_APEI_SUPPORT; +#endif if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))) return; - if (ACPI_SUCCESS(acpi_run_osc(handle, &context))) + if (ACPI_SUCCESS(acpi_run_osc(handle, &context))) { + u32 *capbuf_ret = context.ret.pointer; + if (context.ret.length > OSC_SUPPORT_TYPE) + osc_sb_apei_support_acked = + capbuf_ret[OSC_SUPPORT_TYPE] & OSC_SB_APEI_SUPPORT; kfree(context.ret.pointer); - /* do we need to check the returned cap? Sounds no */ + } + /* do we need to check other returned cap? Sounds no */ } /* -------------------------------------------------------------------------- diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 1deb2a73c2d..e19527de6a9 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -280,6 +280,8 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); #define OSC_SB_CPUHP_OST_SUPPORT 8 #define OSC_SB_APEI_SUPPORT 16 +extern bool osc_sb_apei_support_acked; + /* PCI defined _OSC bits */ /* _OSC DW1 Definition (OS Support Fields) */ #define OSC_EXT_PCI_CONFIG_SUPPORT 1 -- cgit v1.2.3 From 9fb0bfe1408d5506b7b83d13d1eed573fd71d67d Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:21 +0800 Subject: ACPI, APEI, Add WHEA _OSC support APEI firmware first mode must be turned on explicitly on some machines, otherwise there may be no GHES hardware error record for hardware error notification. APEI bit in generic _OSC call can be used to do that, but on some machine, a special WHEA _OSC call must be used. This patch adds the support to that WHEA _OSC call. Signed-off-by: Huang Ying Reviewed-by: Andi Kleen Reviewed-by: Matthew Garrett Signed-off-by: Len Brown --- drivers/acpi/apei/apei-base.c | 26 ++++++++++++++++++++++++++ drivers/acpi/apei/apei-internal.h | 2 ++ drivers/acpi/apei/ghes.c | 10 ++++++++++ 3 files changed, 38 insertions(+) diff --git a/drivers/acpi/apei/apei-base.c b/drivers/acpi/apei/apei-base.c index 0714194229d..8041248fce9 100644 --- a/drivers/acpi/apei/apei-base.c +++ b/drivers/acpi/apei/apei-base.c @@ -604,3 +604,29 @@ struct dentry *apei_get_debugfs_dir(void) return dapei; } EXPORT_SYMBOL_GPL(apei_get_debugfs_dir); + +int apei_osc_setup(void) +{ + static u8 whea_uuid_str[] = "ed855e0c-6c90-47bf-a62a-26de0fc5ad5c"; + acpi_handle handle; + u32 capbuf[3]; + struct acpi_osc_context context = { + .uuid_str = whea_uuid_str, + .rev = 1, + .cap.length = sizeof(capbuf), + .cap.pointer = capbuf, + }; + + capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE; + capbuf[OSC_SUPPORT_TYPE] = 0; + capbuf[OSC_CONTROL_TYPE] = 0; + + if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle)) + || ACPI_FAILURE(acpi_run_osc(handle, &context))) + return -EIO; + else { + kfree(context.ret.pointer); + return 0; + } +} +EXPORT_SYMBOL_GPL(apei_osc_setup); diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h index f286cf753f3..f57050e7a5e 100644 --- a/drivers/acpi/apei/apei-internal.h +++ b/drivers/acpi/apei/apei-internal.h @@ -124,4 +124,6 @@ void apei_estatus_print(const char *pfx, const struct acpi_hest_generic_status *estatus); int apei_estatus_check_header(const struct acpi_hest_generic_status *estatus); int apei_estatus_check(const struct acpi_hest_generic_status *estatus); + +int apei_osc_setup(void); #endif diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index b142b94bf8b..b1390a61cde 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -681,6 +681,16 @@ static int __init ghes_init(void) if (rc) goto err_ioremap_exit; + rc = apei_osc_setup(); + if (rc == 0 && osc_sb_apei_support_acked) + pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit and WHEA _OSC.\n"); + else if (rc == 0 && !osc_sb_apei_support_acked) + pr_info(GHES_PFX "APEI firmware first mode is enabled by WHEA _OSC.\n"); + else if (rc && osc_sb_apei_support_acked) + pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit.\n"); + else + pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n"); + return 0; err_ioremap_exit: ghes_ioremap_exit(); -- cgit v1.2.3 From df013ffb8119c89f062ab05b7f544704315db47b Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:22 +0800 Subject: Add Kconfig option ARCH_HAVE_NMI_SAFE_CMPXCHG cmpxchg() is widely used by lockless code, including NMI-safe lockless code. But on some architectures, the cmpxchg() implementation is not NMI-safe, on these architectures the lockless code may need a spin_trylock_irqsave() based implementation. This patch adds a Kconfig option: ARCH_HAVE_NMI_SAFE_CMPXCHG, so that NMI-safe lockless code can depend on it or provide different implementation according to it. On many architectures, cmpxchg is only NMI-safe for several specific operand sizes. So, ARCH_HAVE_NMI_SAFE_CMPXCHG define in this patch only guarantees cmpxchg is NMI-safe for sizeof(unsigned long). Signed-off-by: Huang Ying Acked-by: Mike Frysinger Acked-by: Paul Mundt Acked-by: Hans-Christian Egtvedt Acked-by: Benjamin Herrenschmidt Acked-by: Chris Metcalf Acked-by: Richard Henderson CC: Mikael Starvik Acked-by: David Howells CC: Yoshinori Sato CC: Tony Luck CC: Hirokazu Takata CC: Geert Uytterhoeven CC: Michal Simek Acked-by: Ralf Baechle CC: Kyle McMartin CC: Martin Schwidefsky CC: Chen Liqin CC: "David S. Miller" CC: Ingo Molnar CC: Chris Zankel Signed-off-by: Len Brown --- arch/Kconfig | 3 +++ arch/alpha/Kconfig | 1 + arch/avr32/Kconfig | 1 + arch/frv/Kconfig | 1 + arch/ia64/Kconfig | 1 + arch/m68k/Kconfig | 1 + arch/parisc/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + arch/x86/Kconfig | 1 + 13 files changed, 15 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 26b0e2397a5..4b0669cbb3b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -178,4 +178,7 @@ config HAVE_ARCH_MUTEX_CPU_RELAX config HAVE_RCU_TABLE_FREE bool +config ARCH_HAVE_NMI_SAFE_CMPXCHG + bool + source "kernel/gcov/Kconfig" diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 60219bf9419..23f8d8c3288 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -13,6 +13,7 @@ config ALPHA select AUTO_IRQ_AFFINITY if SMP select GENERIC_IRQ_SHOW select ARCH_WANT_OPTIONAL_GPIOLIB + select ARCH_HAVE_NMI_SAFE_CMPXCHG help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation of blessed memory, diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index e9d689b7c83..197e96f7040 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -10,6 +10,7 @@ config AVR32 select GENERIC_IRQ_PROBE select HARDIRQS_SW_RESEND select GENERIC_IRQ_SHOW + select ARCH_HAVE_NMI_SAFE_CMPXCHG help AVR32 is a high-performance 32-bit RISC microprocessor core, designed for cost-sensitive embedded applications, with particular diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index cb884e48942..bad27a6ff40 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -7,6 +7,7 @@ config FRV select HAVE_PERF_EVENTS select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW + select ARCH_HAVE_NMI_SAFE_CMPXCHG config ZONE_DMA bool diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 38280ef4a2a..b1227ddc672 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -27,6 +27,7 @@ config IA64 select GENERIC_PENDING_IRQ if SMP select IRQ_PER_CPU select GENERIC_IRQ_SHOW + select ARCH_HAVE_NMI_SAFE_CMPXCHG default y help The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index d66e34c718d..effe8f0fa31 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -6,6 +6,7 @@ config M68K select GENERIC_ATOMIC64 if MMU select HAVE_GENERIC_HARDIRQS if !MMU select GENERIC_IRQ_SHOW if !MMU + select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS config RWSEM_GENERIC_SPINLOCK bool diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 65adc86a230..e077b0bf56c 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -15,6 +15,7 @@ config PARISC select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_PROBE select IRQ_PER_CPU + select ARCH_HAVE_NMI_SAFE_CMPXCHG help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2729c6663d8..e55f754dd50 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -134,6 +134,7 @@ config PPC select GENERIC_IRQ_SHOW_LEVEL select HAVE_RCU_TABLE_FREE if SMP select HAVE_SYSCALL_TRACEPOINTS + select ARCH_HAVE_NMI_SAFE_CMPXCHG config EARLY_PRINTK bool diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c03fef7a9c2..0f98bbddade 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -81,6 +81,7 @@ config S390 select INIT_ALL_POSSIBLE select HAVE_IRQ_WORK select HAVE_PERF_EVENTS + select ARCH_HAVE_NMI_SAFE_CMPXCHG select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index bbdeb48bbf8..66405eb2176 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -11,6 +11,7 @@ config SUPERH select HAVE_DMA_ATTRS select HAVE_IRQ_WORK select HAVE_PERF_EVENTS + select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) select PERF_USE_VMALLOC select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 253986bd6bb..a63cddfb3a1 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -53,6 +53,7 @@ config SPARC64 select HAVE_PERF_EVENTS select PERF_USE_VMALLOC select IRQ_PREFLOW_FASTEOI + select ARCH_HAVE_NMI_SAFE_CMPXCHG config ARCH_DEFCONFIG string diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 0249b8b4db5..b30f71ac0d0 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -12,6 +12,7 @@ config TILE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW select SYS_HYPERVISOR + select ARCH_HAVE_NMI_SAFE_CMPXCHG if !M386 # FIXME: investigate whether we need/want these options. # select HAVE_IOREMAP_PROT diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da349723d41..a680a60898d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -70,6 +70,7 @@ config X86 select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if (X86_64 && NET) + select ARCH_HAVE_NMI_SAFE_CMPXCHG config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) -- cgit v1.2.3 From f49f23abf3dd786ddcac1c1e7db3c2013b07413f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:23 +0800 Subject: lib, Add lock-less NULL terminated single list Cmpxchg is used to implement adding new entry to the list, deleting all entries from the list, deleting first entry of the list and some other operations. Because this is a single list, so the tail can not be accessed in O(1). If there are multiple producers and multiple consumers, llist_add can be used in producers and llist_del_all can be used in consumers. They can work simultaneously without lock. But llist_del_first can not be used here. Because llist_del_first depends on list->first->next does not changed if list->first is not changed during its operation, but llist_del_first, llist_add, llist_add (or llist_del_all, llist_add, llist_add) sequence in another consumer may violate that. If there are multiple producers and one consumer, llist_add can be used in producers and llist_del_all or llist_del_first can be used in the consumer. This can be summarized as follow: | add | del_first | del_all add | - | - | - del_first | | L | L del_all | | | - Where "-" stands for no lock is needed, while "L" stands for lock is needed. The list entries deleted via llist_del_all can be traversed with traversing function such as llist_for_each etc. But the list entries can not be traversed safely before deleted from the list. The order of deleted entries is from the newest to the oldest added one. If you want to traverse from the oldest to the newest, you must reverse the order by yourself before traversing. The basic atomic operation of this list is cmpxchg on long. On architectures that don't have NMI-safe cmpxchg implementation, the list can NOT be used in NMI handler. So code uses the list in NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. Signed-off-by: Huang Ying Reviewed-by: Andi Kleen Reviewed-by: Mathieu Desnoyers Cc: Andrew Morton Signed-off-by: Len Brown --- include/linux/llist.h | 126 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig | 3 ++ lib/Makefile | 2 + lib/llist.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 260 insertions(+) create mode 100644 include/linux/llist.h create mode 100644 lib/llist.c diff --git a/include/linux/llist.h b/include/linux/llist.h new file mode 100644 index 00000000000..aa0c8b5b3cd --- /dev/null +++ b/include/linux/llist.h @@ -0,0 +1,126 @@ +#ifndef LLIST_H +#define LLIST_H +/* + * Lock-less NULL terminated single linked list + * + * If there are multiple producers and multiple consumers, llist_add + * can be used in producers and llist_del_all can be used in + * consumers. They can work simultaneously without lock. But + * llist_del_first can not be used here. Because llist_del_first + * depends on list->first->next does not changed if list->first is not + * changed during its operation, but llist_del_first, llist_add, + * llist_add (or llist_del_all, llist_add, llist_add) sequence in + * another consumer may violate that. + * + * If there are multiple producers and one consumer, llist_add can be + * used in producers and llist_del_all or llist_del_first can be used + * in the consumer. + * + * This can be summarized as follow: + * + * | add | del_first | del_all + * add | - | - | - + * del_first | | L | L + * del_all | | | - + * + * Where "-" stands for no lock is needed, while "L" stands for lock + * is needed. + * + * The list entries deleted via llist_del_all can be traversed with + * traversing function such as llist_for_each etc. But the list + * entries can not be traversed safely before deleted from the list. + * The order of deleted entries is from the newest to the oldest added + * one. If you want to traverse from the oldest to the newest, you + * must reverse the order by yourself before traversing. + * + * The basic atomic operation of this list is cmpxchg on long. On + * architectures that don't have NMI-safe cmpxchg implementation, the + * list can NOT be used in NMI handler. So code uses the list in NMI + * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. + */ + +struct llist_head { + struct llist_node *first; +}; + +struct llist_node { + struct llist_node *next; +}; + +#define LLIST_HEAD_INIT(name) { NULL } +#define LLIST_HEAD(name) struct llist_head name = LLIST_HEAD_INIT(name) + +/** + * init_llist_head - initialize lock-less list head + * @head: the head for your lock-less list + */ +static inline void init_llist_head(struct llist_head *list) +{ + list->first = NULL; +} + +/** + * llist_entry - get the struct of this entry + * @ptr: the &struct llist_node pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the llist_node within the struct. + */ +#define llist_entry(ptr, type, member) \ + container_of(ptr, type, member) + +/** + * llist_for_each - iterate over some deleted entries of a lock-less list + * @pos: the &struct llist_node to use as a loop cursor + * @node: the first entry of deleted list entries + * + * In general, some entries of the lock-less list can be traversed + * safely only after being deleted from list, so start with an entry + * instead of list head. + * + * If being used on entries deleted from lock-less list directly, the + * traverse order is from the newest to the oldest added entry. If + * you want to traverse from the oldest to the newest, you must + * reverse the order by yourself before traversing. + */ +#define llist_for_each(pos, node) \ + for ((pos) = (node); pos; (pos) = (pos)->next) + +/** + * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type + * @pos: the type * to use as a loop cursor. + * @node: the fist entry of deleted list entries. + * @member: the name of the llist_node with the struct. + * + * In general, some entries of the lock-less list can be traversed + * safely only after being removed from list, so start with an entry + * instead of list head. + * + * If being used on entries deleted from lock-less list directly, the + * traverse order is from the newest to the oldest added entry. If + * you want to traverse from the oldest to the newest, you must + * reverse the order by yourself before traversing. + */ +#define llist_for_each_entry(pos, node, member) \ + for ((pos) = llist_entry((node), typeof(*(pos)), member); \ + &(pos)->member != NULL; \ + (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member)) + +/** + * llist_empty - tests whether a lock-less list is empty + * @head: the list to test + * + * Not guaranteed to be accurate or up to date. Just a quick way to + * test whether the list is empty without deleting something from the + * list. + */ +static inline int llist_empty(const struct llist_head *head) +{ + return ACCESS_ONCE(head->first) == NULL; +} + +void llist_add(struct llist_node *new, struct llist_head *head); +void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, + struct llist_head *head); +struct llist_node *llist_del_first(struct llist_head *head); +struct llist_node *llist_del_all(struct llist_head *head); +#endif /* LLIST_H */ diff --git a/lib/Kconfig b/lib/Kconfig index 830181cc7a8..25c19678a30 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -262,4 +262,7 @@ config AVERAGE If unsure, say N. +config LLIST + bool + endmenu diff --git a/lib/Makefile b/lib/Makefile index 6b597fdb189..d770b817202 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -112,6 +112,8 @@ obj-$(CONFIG_AVERAGE) += average.o obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o +obj-$(CONFIG_LLIST) += llist.o + hostprogs-y := gen_crc32table clean-files := crc32table.h diff --git a/lib/llist.c b/lib/llist.c new file mode 100644 index 00000000000..da445724fa1 --- /dev/null +++ b/lib/llist.c @@ -0,0 +1,129 @@ +/* + * Lock-less NULL terminated single linked list + * + * The basic atomic operation of this list is cmpxchg on long. On + * architectures that don't have NMI-safe cmpxchg implementation, the + * list can NOT be used in NMI handler. So code uses the list in NMI + * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. + * + * Copyright 2010,2011 Intel Corp. + * Author: Huang Ying + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include + +#include + +/** + * llist_add - add a new entry + * @new: new entry to be added + * @head: the head for your lock-less list + */ +void llist_add(struct llist_node *new, struct llist_head *head) +{ + struct llist_node *entry, *old_entry; + +#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG + BUG_ON(in_nmi()); +#endif + + entry = head->first; + do { + old_entry = entry; + new->next = entry; + cpu_relax(); + } while ((entry = cmpxchg(&head->first, old_entry, new)) != old_entry); +} +EXPORT_SYMBOL_GPL(llist_add); + +/** + * llist_add_batch - add several linked entries in batch + * @new_first: first entry in batch to be added + * @new_last: last entry in batch to be added + * @head: the head for your lock-less list + */ +void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, + struct llist_head *head) +{ + struct llist_node *entry, *old_entry; + +#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG + BUG_ON(in_nmi()); +#endif + + entry = head->first; + do { + old_entry = entry; + new_last->next = entry; + cpu_relax(); + } while ((entry = cmpxchg(&head->first, old_entry, new_first)) != old_entry); +} +EXPORT_SYMBOL_GPL(llist_add_batch); + +/** + * llist_del_first - delete the first entry of lock-less list + * @head: the head for your lock-less list + * + * If list is empty, return NULL, otherwise, return the first entry + * deleted, this is the newest added one. + * + * Only one llist_del_first user can be used simultaneously with + * multiple llist_add users without lock. Because otherwise + * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add, + * llist_add) sequence in another user may change @head->first->next, + * but keep @head->first. If multiple consumers are needed, please + * use llist_del_all or use lock between consumers. + */ +struct llist_node *llist_del_first(struct llist_head *head) +{ + struct llist_node *entry, *old_entry, *next; + +#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG + BUG_ON(in_nmi()); +#endif + + entry = head->first; + do { + if (entry == NULL) + return NULL; + old_entry = entry; + next = entry->next; + cpu_relax(); + } while ((entry = cmpxchg(&head->first, old_entry, next)) != old_entry); + + return entry; +} +EXPORT_SYMBOL_GPL(llist_del_first); + +/** + * llist_del_all - delete all entries from lock-less list + * @head: the head of lock-less list to delete all entries + * + * If list is empty, return NULL, otherwise, delete all entries and + * return the pointer to the first entry. The order of entries + * deleted is from the newest to the oldest added one. + */ +struct llist_node *llist_del_all(struct llist_head *head) +{ +#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG + BUG_ON(in_nmi()); +#endif + + return xchg(&head->first, NULL); +} +EXPORT_SYMBOL_GPL(llist_del_all); -- cgit v1.2.3 From 7f184275aa306046fe7edcbef3229754f0d97402 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:24 +0800 Subject: lib, Make gen_pool memory allocator lockless This version of the gen_pool memory allocator supports lockless operation. This makes it safe to use in NMI handlers and other special unblockable contexts that could otherwise deadlock on locks. This is implemented by using atomic operations and retries on any conflicts. The disadvantage is that there may be livelocks in extreme cases. For better scalability, one gen_pool allocator can be used for each CPU. The lockless operation only works if there is enough memory available. If new memory is added to the pool a lock has to be still taken. So any user relying on locklessness has to ensure that sufficient memory is preallocated. The basic atomic operation of this allocator is cmpxchg on long. On architectures that don't have NMI-safe cmpxchg implementation, the allocator can NOT be used in NMI handler. So code uses the allocator in NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. Signed-off-by: Huang Ying Reviewed-by: Andi Kleen Reviewed-by: Mathieu Desnoyers Cc: Andrew Morton Signed-off-by: Len Brown --- include/linux/bitmap.h | 1 + include/linux/genalloc.h | 34 +++++- lib/bitmap.c | 2 - lib/genalloc.c | 300 ++++++++++++++++++++++++++++++++++++++--------- 4 files changed, 272 insertions(+), 65 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index dcafe0bf000..907dd58aa22 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -145,6 +145,7 @@ extern void bitmap_release_region(unsigned long *bitmap, int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order); extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); +#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG)) #define BITMAP_LAST_WORD_MASK(nbits) \ ( \ ((nbits) % BITS_PER_LONG) ? \ diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 5bbebda78b0..5e98eeb2af3 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -1,8 +1,26 @@ /* - * Basic general purpose allocator for managing special purpose memory - * not managed by the regular kmalloc/kfree interface. - * Uses for this includes on-device special memory, uncached memory - * etc. + * Basic general purpose allocator for managing special purpose + * memory, for example, memory that is not managed by the regular + * kmalloc/kfree interface. Uses for this includes on-device special + * memory, uncached memory etc. + * + * It is safe to use the allocator in NMI handlers and other special + * unblockable contexts that could otherwise deadlock on locks. This + * is implemented by using atomic operations and retries on any + * conflicts. The disadvantage is that there may be livelocks in + * extreme cases. For better scalability, one allocator can be used + * for each CPU. + * + * The lockless operation only works if there is enough memory + * available. If new memory is added to the pool a lock has to be + * still taken. So any user relying on locklessness has to ensure + * that sufficient memory is preallocated. + * + * The basic atomic operation of this allocator is cmpxchg on long. + * On architectures that don't have NMI-safe cmpxchg implementation, + * the allocator can NOT be used in NMI handler. So code uses the + * allocator in NMI handler should depend on + * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. @@ -15,7 +33,7 @@ * General purpose special memory pool descriptor. */ struct gen_pool { - rwlock_t lock; + spinlock_t lock; struct list_head chunks; /* list of chunks in this pool */ int min_alloc_order; /* minimum allocation order */ }; @@ -24,8 +42,8 @@ struct gen_pool { * General purpose special memory pool chunk descriptor. */ struct gen_pool_chunk { - spinlock_t lock; struct list_head next_chunk; /* next chunk in pool */ + atomic_t avail; phys_addr_t phys_addr; /* physical starting address of memory chunk */ unsigned long start_addr; /* starting address of memory chunk */ unsigned long end_addr; /* ending address of memory chunk */ @@ -56,4 +74,8 @@ static inline int gen_pool_add(struct gen_pool *pool, unsigned long addr, extern void gen_pool_destroy(struct gen_pool *); extern unsigned long gen_pool_alloc(struct gen_pool *, size_t); extern void gen_pool_free(struct gen_pool *, unsigned long, size_t); +extern void gen_pool_for_each_chunk(struct gen_pool *, + void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *); +extern size_t gen_pool_avail(struct gen_pool *); +extern size_t gen_pool_size(struct gen_pool *); #endif /* __GENALLOC_H__ */ diff --git a/lib/bitmap.c b/lib/bitmap.c index 3f3b68199d7..e3c9e999501 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -271,8 +271,6 @@ int __bitmap_weight(const unsigned long *bitmap, int bits) } EXPORT_SYMBOL(__bitmap_weight); -#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG)) - void bitmap_set(unsigned long *map, int start, int nr) { unsigned long *p = map + BIT_WORD(start); diff --git a/lib/genalloc.c b/lib/genalloc.c index 577ddf80597..f352cc42f4f 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -1,8 +1,26 @@ /* - * Basic general purpose allocator for managing special purpose memory - * not managed by the regular kmalloc/kfree interface. - * Uses for this includes on-device special memory, uncached memory - * etc. + * Basic general purpose allocator for managing special purpose + * memory, for example, memory that is not managed by the regular + * kmalloc/kfree interface. Uses for this includes on-device special + * memory, uncached memory etc. + * + * It is safe to use the allocator in NMI handlers and other special + * unblockable contexts that could otherwise deadlock on locks. This + * is implemented by using atomic operations and retries on any + * conflicts. The disadvantage is that there may be livelocks in + * extreme cases. For better scalability, one allocator can be used + * for each CPU. + * + * The lockless operation only works if there is enough memory + * available. If new memory is added to the pool a lock has to be + * still taken. So any user relying on locklessness has to ensure + * that sufficient memory is preallocated. + * + * The basic atomic operation of this allocator is cmpxchg on long. + * On architectures that don't have NMI-safe cmpxchg implementation, + * the allocator can NOT be used in NMI handler. So code uses the + * allocator in NMI handler should depend on + * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. * * Copyright 2005 (C) Jes Sorensen * @@ -13,8 +31,109 @@ #include #include #include +#include +#include #include +static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) +{ + unsigned long val, nval; + + nval = *addr; + do { + val = nval; + if (val & mask_to_set) + return -EBUSY; + cpu_relax(); + } while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val); + + return 0; +} + +static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear) +{ + unsigned long val, nval; + + nval = *addr; + do { + val = nval; + if ((val & mask_to_clear) != mask_to_clear) + return -EBUSY; + cpu_relax(); + } while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val); + + return 0; +} + +/* + * bitmap_set_ll - set the specified number of bits at the specified position + * @map: pointer to a bitmap + * @start: a bit position in @map + * @nr: number of bits to set + * + * Set @nr bits start from @start in @map lock-lessly. Several users + * can set/clear the same bitmap simultaneously without lock. If two + * users set the same bit, one user will return remain bits, otherwise + * return 0. + */ +static int bitmap_set_ll(unsigned long *map, int start, int nr) +{ + unsigned long *p = map + BIT_WORD(start); + const int size = start + nr; + int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); + + while (nr - bits_to_set >= 0) { + if (set_bits_ll(p, mask_to_set)) + return nr; + nr -= bits_to_set; + bits_to_set = BITS_PER_LONG; + mask_to_set = ~0UL; + p++; + } + if (nr) { + mask_to_set &= BITMAP_LAST_WORD_MASK(size); + if (set_bits_ll(p, mask_to_set)) + return nr; + } + + return 0; +} + +/* + * bitmap_clear_ll - clear the specified number of bits at the specified position + * @map: pointer to a bitmap + * @start: a bit position in @map + * @nr: number of bits to set + * + * Clear @nr bits start from @start in @map lock-lessly. Several users + * can set/clear the same bitmap simultaneously without lock. If two + * users clear the same bit, one user will return remain bits, + * otherwise return 0. + */ +static int bitmap_clear_ll(unsigned long *map, int start, int nr) +{ + unsigned long *p = map + BIT_WORD(start); + const int size = start + nr; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + + while (nr - bits_to_clear >= 0) { + if (clear_bits_ll(p, mask_to_clear)) + return nr; + nr -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = ~0UL; + p++; + } + if (nr) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + if (clear_bits_ll(p, mask_to_clear)) + return nr; + } + + return 0; +} /** * gen_pool_create - create a new special memory pool @@ -30,7 +149,7 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid) pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid); if (pool != NULL) { - rwlock_init(&pool->lock); + spin_lock_init(&pool->lock); INIT_LIST_HEAD(&pool->chunks); pool->min_alloc_order = min_alloc_order; } @@ -63,14 +182,14 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy if (unlikely(chunk == NULL)) return -ENOMEM; - spin_lock_init(&chunk->lock); chunk->phys_addr = phys; chunk->start_addr = virt; chunk->end_addr = virt + size; + atomic_set(&chunk->avail, size); - write_lock(&pool->lock); - list_add(&chunk->next_chunk, &pool->chunks); - write_unlock(&pool->lock); + spin_lock(&pool->lock); + list_add_rcu(&chunk->next_chunk, &pool->chunks); + spin_unlock(&pool->lock); return 0; } @@ -85,19 +204,19 @@ EXPORT_SYMBOL(gen_pool_add_virt); */ phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr) { - struct list_head *_chunk; struct gen_pool_chunk *chunk; + phys_addr_t paddr = -1; - read_lock(&pool->lock); - list_for_each(_chunk, &pool->chunks) { - chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); - - if (addr >= chunk->start_addr && addr < chunk->end_addr) - return chunk->phys_addr + addr - chunk->start_addr; + rcu_read_lock(); + list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { + if (addr >= chunk->start_addr && addr < chunk->end_addr) { + paddr = chunk->phys_addr + (addr - chunk->start_addr); + break; + } } - read_unlock(&pool->lock); + rcu_read_unlock(); - return -1; + return paddr; } EXPORT_SYMBOL(gen_pool_virt_to_phys); @@ -115,7 +234,6 @@ void gen_pool_destroy(struct gen_pool *pool) int order = pool->min_alloc_order; int bit, end_bit; - list_for_each_safe(_chunk, _next_chunk, &pool->chunks) { chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); list_del(&chunk->next_chunk); @@ -137,44 +255,50 @@ EXPORT_SYMBOL(gen_pool_destroy); * @size: number of bytes to allocate from the pool * * Allocate the requested number of bytes from the specified pool. - * Uses a first-fit algorithm. + * Uses a first-fit algorithm. Can not be used in NMI handler on + * architectures without NMI-safe cmpxchg implementation. */ unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) { - struct list_head *_chunk; struct gen_pool_chunk *chunk; - unsigned long addr, flags; + unsigned long addr = 0; int order = pool->min_alloc_order; - int nbits, start_bit, end_bit; + int nbits, start_bit = 0, end_bit, remain; + +#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG + BUG_ON(in_nmi()); +#endif if (size == 0) return 0; nbits = (size + (1UL << order) - 1) >> order; - - read_lock(&pool->lock); - list_for_each(_chunk, &pool->chunks) { - chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); + rcu_read_lock(); + list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { + if (size > atomic_read(&chunk->avail)) + continue; end_bit = (chunk->end_addr - chunk->start_addr) >> order; - - spin_lock_irqsave(&chunk->lock, flags); - start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit, 0, - nbits, 0); - if (start_bit >= end_bit) { - spin_unlock_irqrestore(&chunk->lock, flags); +retry: + start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit, + start_bit, nbits, 0); + if (start_bit >= end_bit) continue; + remain = bitmap_set_ll(chunk->bits, start_bit, nbits); + if (remain) { + remain = bitmap_clear_ll(chunk->bits, start_bit, + nbits - remain); + BUG_ON(remain); + goto retry; } addr = chunk->start_addr + ((unsigned long)start_bit << order); - - bitmap_set(chunk->bits, start_bit, nbits); - spin_unlock_irqrestore(&chunk->lock, flags); - read_unlock(&pool->lock); - return addr; + size = nbits << order; + atomic_sub(size, &chunk->avail); + break; } - read_unlock(&pool->lock); - return 0; + rcu_read_unlock(); + return addr; } EXPORT_SYMBOL(gen_pool_alloc); @@ -184,33 +308,95 @@ EXPORT_SYMBOL(gen_pool_alloc); * @addr: starting address of memory to free back to pool * @size: size in bytes of memory to free * - * Free previously allocated special memory back to the specified pool. + * Free previously allocated special memory back to the specified + * pool. Can not be used in NMI handler on architectures without + * NMI-safe cmpxchg implementation. */ void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) { - struct list_head *_chunk; struct gen_pool_chunk *chunk; - unsigned long flags; int order = pool->min_alloc_order; - int bit, nbits; + int start_bit, nbits, remain; - nbits = (size + (1UL << order) - 1) >> order; - - read_lock(&pool->lock); - list_for_each(_chunk, &pool->chunks) { - chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); +#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG + BUG_ON(in_nmi()); +#endif + nbits = (size + (1UL << order) - 1) >> order; + rcu_read_lock(); + list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { if (addr >= chunk->start_addr && addr < chunk->end_addr) { BUG_ON(addr + size > chunk->end_addr); - spin_lock_irqsave(&chunk->lock, flags); - bit = (addr - chunk->start_addr) >> order; - while (nbits--) - __clear_bit(bit++, chunk->bits); - spin_unlock_irqrestore(&chunk->lock, flags); - break; + start_bit = (addr - chunk->start_addr) >> order; + remain = bitmap_clear_ll(chunk->bits, start_bit, nbits); + BUG_ON(remain); + size = nbits << order; + atomic_add(size, &chunk->avail); + rcu_read_unlock(); + return; } } - BUG_ON(nbits > 0); - read_unlock(&pool->lock); + rcu_read_unlock(); + BUG(); } EXPORT_SYMBOL(gen_pool_free); + +/** + * gen_pool_for_each_chunk - call func for every chunk of generic memory pool + * @pool: the generic memory pool + * @func: func to call + * @data: additional data used by @func + * + * Call @func for every chunk of generic memory pool. The @func is + * called with rcu_read_lock held. + */ +void gen_pool_for_each_chunk(struct gen_pool *pool, + void (*func)(struct gen_pool *pool, struct gen_pool_chunk *chunk, void *data), + void *data) +{ + struct gen_pool_chunk *chunk; + + rcu_read_lock(); + list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk) + func(pool, chunk, data); + rcu_read_unlock(); +} +EXPORT_SYMBOL(gen_pool_for_each_chunk); + +/** + * gen_pool_avail - get available free space of the pool + * @pool: pool to get available free space + * + * Return available free space of the specified pool. + */ +size_t gen_pool_avail(struct gen_pool *pool) +{ + struct gen_pool_chunk *chunk; + size_t avail = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) + avail += atomic_read(&chunk->avail); + rcu_read_unlock(); + return avail; +} +EXPORT_SYMBOL_GPL(gen_pool_avail); + +/** + * gen_pool_size - get size in bytes of memory managed by the pool + * @pool: pool to get size + * + * Return size in bytes of memory managed by the pool. + */ +size_t gen_pool_size(struct gen_pool *pool) +{ + struct gen_pool_chunk *chunk; + size_t size = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) + size += chunk->end_addr - chunk->start_addr; + rcu_read_unlock(); + return size; +} +EXPORT_SYMBOL_GPL(gen_pool_size); -- cgit v1.2.3 From 67eb2e99076708cc790019a6a08ca3e0ae130a3a Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:25 +0800 Subject: ACPI, APEI, GHES, printk support for recoverable error via NMI Some APEI GHES recoverable errors are reported via NMI, but printk is not safe in NMI context. To solve the issue, a lock-less memory allocator is used to allocate memory in NMI handler, save the error record into the allocated memory, put the error record into a lock-less list. On the other hand, an irq_work is used to delay the operation from NMI context to IRQ context. The irq_work IRQ handler will remove nodes from lock-less list, printk the error record and do some further processing include recovery operation, then free the memory. Signed-off-by: Huang Ying Signed-off-by: Len Brown --- drivers/acpi/apei/Kconfig | 2 + drivers/acpi/apei/ghes.c | 209 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 193 insertions(+), 18 deletions(-) diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig index 3f45dde17ae..35596eaaca1 100644 --- a/drivers/acpi/apei/Kconfig +++ b/drivers/acpi/apei/Kconfig @@ -13,6 +13,8 @@ config ACPI_APEI_GHES bool "APEI Generic Hardware Error Source" depends on ACPI_APEI && X86 select ACPI_HED + select LLIST + select GENERIC_ALLOCATOR help Generic Hardware Error Source provides a way to report platform hardware errors (such as that from chipset). It diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index b1390a61cde..d1a40218e17 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -12,7 +12,7 @@ * For more information about Generic Hardware Error Source, please * refer to ACPI Specification version 4.0, section 17.3.2.6 * - * Copyright 2010 Intel Corp. + * Copyright 2010,2011 Intel Corp. * Author: Huang Ying * * This program is free software; you can redistribute it and/or @@ -42,6 +42,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -53,6 +56,15 @@ #define GHES_PFX "GHES: " #define GHES_ESTATUS_MAX_SIZE 65536 +#define GHES_ESOURCE_PREALLOC_MAX_SIZE 65536 + +#define GHES_ESTATUS_POOL_MIN_ALLOC_ORDER 3 + +#define GHES_ESTATUS_NODE_LEN(estatus_len) \ + (sizeof(struct ghes_estatus_node) + (estatus_len)) +#define GHES_ESTATUS_FROM_NODE(estatus_node) \ + ((struct acpi_hest_generic_status *) \ + ((struct ghes_estatus_node *)(estatus_node) + 1)) /* * One struct ghes is created for each generic hardware error source. @@ -77,6 +89,11 @@ struct ghes { }; }; +struct ghes_estatus_node { + struct llist_node llnode; + struct acpi_hest_generic *generic; +}; + int ghes_disable; module_param_named(disable, ghes_disable, bool, 0); @@ -124,6 +141,19 @@ static struct vm_struct *ghes_ioremap_area; static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi); static DEFINE_SPINLOCK(ghes_ioremap_lock_irq); +/* + * printk is not safe in NMI context. So in NMI handler, we allocate + * required memory from lock-less memory allocator + * (ghes_estatus_pool), save estatus into it, put them into lock-less + * list (ghes_estatus_llist), then delay printk into IRQ context via + * irq_work (ghes_proc_irq_work). ghes_estatus_size_request record + * required pool size by all NMI error source. + */ +static struct gen_pool *ghes_estatus_pool; +static unsigned long ghes_estatus_pool_size_request; +static struct llist_head ghes_estatus_llist; +static struct irq_work ghes_proc_irq_work; + static int ghes_ioremap_init(void) { ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES, @@ -183,6 +213,55 @@ static void ghes_iounmap_irq(void __iomem *vaddr_ptr) __flush_tlb_one(vaddr); } +static int ghes_estatus_pool_init(void) +{ + ghes_estatus_pool = gen_pool_create(GHES_ESTATUS_POOL_MIN_ALLOC_ORDER, -1); + if (!ghes_estatus_pool) + return -ENOMEM; + return 0; +} + +static void ghes_estatus_pool_free_chunk_page(struct gen_pool *pool, + struct gen_pool_chunk *chunk, + void *data) +{ + free_page(chunk->start_addr); +} + +static void ghes_estatus_pool_exit(void) +{ + gen_pool_for_each_chunk(ghes_estatus_pool, + ghes_estatus_pool_free_chunk_page, NULL); + gen_pool_destroy(ghes_estatus_pool); +} + +static int ghes_estatus_pool_expand(unsigned long len) +{ + unsigned long i, pages, size, addr; + int ret; + + ghes_estatus_pool_size_request += PAGE_ALIGN(len); + size = gen_pool_size(ghes_estatus_pool); + if (size >= ghes_estatus_pool_size_request) + return 0; + pages = (ghes_estatus_pool_size_request - size) / PAGE_SIZE; + for (i = 0; i < pages; i++) { + addr = __get_free_page(GFP_KERNEL); + if (!addr) + return -ENOMEM; + ret = gen_pool_add(ghes_estatus_pool, addr, PAGE_SIZE, -1); + if (ret) + return ret; + } + + return 0; +} + +static void ghes_estatus_pool_shrink(unsigned long len) +{ + ghes_estatus_pool_size_request -= PAGE_ALIGN(len); +} + static struct ghes *ghes_new(struct acpi_hest_generic *generic) { struct ghes *ghes; @@ -344,13 +423,13 @@ static void ghes_clear_estatus(struct ghes *ghes) ghes->flags &= ~GHES_TO_CLEAR; } -static void ghes_do_proc(struct ghes *ghes) +static void ghes_do_proc(const struct acpi_hest_generic_status *estatus) { int sev, processed = 0; struct acpi_hest_generic_data *gdata; - sev = ghes_severity(ghes->estatus->error_severity); - apei_estatus_for_each_section(ghes->estatus, gdata) { + sev = ghes_severity(estatus->error_severity); + apei_estatus_for_each_section(estatus, gdata) { #ifdef CONFIG_X86_MCE if (!uuid_le_cmp(*(uuid_le *)gdata->section_type, CPER_SEC_PLATFORM_MEM)) { @@ -363,27 +442,37 @@ static void ghes_do_proc(struct ghes *ghes) } } -static void __ghes_print_estatus(const char *pfx, struct ghes *ghes) +static void __ghes_print_estatus(const char *pfx, + const struct acpi_hest_generic *generic, + const struct acpi_hest_generic_status *estatus) { if (pfx == NULL) { - if (ghes_severity(ghes->estatus->error_severity) <= + if (ghes_severity(estatus->error_severity) <= GHES_SEV_CORRECTED) pfx = KERN_WARNING HW_ERR; else pfx = KERN_ERR HW_ERR; } printk("%s""Hardware error from APEI Generic Hardware Error Source: %d\n", - pfx, ghes->generic->header.source_id); - apei_estatus_print(pfx, ghes->estatus); + pfx, generic->header.source_id); + apei_estatus_print(pfx, estatus); } -static void ghes_print_estatus(const char *pfx, struct ghes *ghes) +static void ghes_print_estatus(const char *pfx, + const struct acpi_hest_generic *generic, + const struct acpi_hest_generic_status *estatus) { /* Not more than 2 messages every 5 seconds */ - static DEFINE_RATELIMIT_STATE(ratelimit, 5*HZ, 2); + static DEFINE_RATELIMIT_STATE(ratelimit_corrected, 5*HZ, 2); + static DEFINE_RATELIMIT_STATE(ratelimit_uncorrected, 5*HZ, 2); + struct ratelimit_state *ratelimit; - if (__ratelimit(&ratelimit)) - __ghes_print_estatus(pfx, ghes); + if (ghes_severity(estatus->error_severity) <= GHES_SEV_CORRECTED) + ratelimit = &ratelimit_corrected; + else + ratelimit = &ratelimit_uncorrected; + if (__ratelimit(ratelimit)) + __ghes_print_estatus(pfx, generic, estatus); } static int ghes_proc(struct ghes *ghes) @@ -393,8 +482,8 @@ static int ghes_proc(struct ghes *ghes) rc = ghes_read_estatus(ghes, 0); if (rc) goto out; - ghes_print_estatus(NULL, ghes); - ghes_do_proc(ghes); + ghes_print_estatus(NULL, ghes->generic, ghes->estatus); + ghes_do_proc(ghes->estatus); out: ghes_clear_estatus(ghes); @@ -453,6 +542,40 @@ static int ghes_notify_sci(struct notifier_block *this, return ret; } +static void ghes_proc_in_irq(struct irq_work *irq_work) +{ + struct llist_node *llnode, *next, *tail = NULL; + struct ghes_estatus_node *estatus_node; + struct acpi_hest_generic_status *estatus; + u32 len, node_len; + + /* + * Because the time order of estatus in list is reversed, + * revert it back to proper order. + */ + llnode = llist_del_all(&ghes_estatus_llist); + while (llnode) { + next = llnode->next; + llnode->next = tail; + tail = llnode; + llnode = next; + } + llnode = tail; + while (llnode) { + next = llnode->next; + estatus_node = llist_entry(llnode, struct ghes_estatus_node, + llnode); + estatus = GHES_ESTATUS_FROM_NODE(estatus_node); + len = apei_estatus_len(estatus); + node_len = GHES_ESTATUS_NODE_LEN(len); + ghes_do_proc(estatus); + ghes_print_estatus(NULL, estatus_node->generic, estatus); + gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, + node_len); + llnode = next; + } +} + static int ghes_notify_nmi(struct notifier_block *this, unsigned long cmd, void *data) { @@ -482,7 +605,8 @@ static int ghes_notify_nmi(struct notifier_block *this, if (sev_global >= GHES_SEV_PANIC) { oops_begin(); - __ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global); + __ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global->generic, + ghes_global->estatus); /* reboot to log the error! */ if (panic_timeout == 0) panic_timeout = ghes_panic_timeout; @@ -490,12 +614,31 @@ static int ghes_notify_nmi(struct notifier_block *this, } list_for_each_entry_rcu(ghes, &ghes_nmi, list) { +#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG + u32 len, node_len; + struct ghes_estatus_node *estatus_node; + struct acpi_hest_generic_status *estatus; +#endif if (!(ghes->flags & GHES_TO_CLEAR)) continue; - /* Do not print estatus because printk is not NMI safe */ - ghes_do_proc(ghes); +#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG + /* Save estatus for further processing in IRQ context */ + len = apei_estatus_len(ghes->estatus); + node_len = GHES_ESTATUS_NODE_LEN(len); + estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool, + node_len); + if (estatus_node) { + estatus_node->generic = ghes->generic; + estatus = GHES_ESTATUS_FROM_NODE(estatus_node); + memcpy(estatus, ghes->estatus, len); + llist_add(&estatus_node->llnode, &ghes_estatus_llist); + } +#endif ghes_clear_estatus(ghes); } +#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG + irq_work_queue(&ghes_proc_irq_work); +#endif out: raw_spin_unlock(&ghes_nmi_lock); @@ -510,10 +653,26 @@ static struct notifier_block ghes_notifier_nmi = { .notifier_call = ghes_notify_nmi, }; +static unsigned long ghes_esource_prealloc_size( + const struct acpi_hest_generic *generic) +{ + unsigned long block_length, prealloc_records, prealloc_size; + + block_length = min_t(unsigned long, generic->error_block_length, + GHES_ESTATUS_MAX_SIZE); + prealloc_records = max_t(unsigned long, + generic->records_to_preallocate, 1); + prealloc_size = min_t(unsigned long, block_length * prealloc_records, + GHES_ESOURCE_PREALLOC_MAX_SIZE); + + return prealloc_size; +} + static int __devinit ghes_probe(struct platform_device *ghes_dev) { struct acpi_hest_generic *generic; struct ghes *ghes = NULL; + unsigned long len; int rc = -EINVAL; generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data; @@ -579,6 +738,8 @@ static int __devinit ghes_probe(struct platform_device *ghes_dev) mutex_unlock(&ghes_list_mutex); break; case ACPI_HEST_NOTIFY_NMI: + len = ghes_esource_prealloc_size(generic); + ghes_estatus_pool_expand(len); mutex_lock(&ghes_list_mutex); if (list_empty(&ghes_nmi)) register_die_notifier(&ghes_notifier_nmi); @@ -603,6 +764,7 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev) { struct ghes *ghes; struct acpi_hest_generic *generic; + unsigned long len; ghes = platform_get_drvdata(ghes_dev); generic = ghes->generic; @@ -633,6 +795,8 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev) * freed after NMI handler finishes. */ synchronize_rcu(); + len = ghes_esource_prealloc_size(generic); + ghes_estatus_pool_shrink(len); break; default: BUG(); @@ -673,14 +837,20 @@ static int __init ghes_init(void) return -EINVAL; } + init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq); + rc = ghes_ioremap_init(); if (rc) goto err; - rc = platform_driver_register(&ghes_platform_driver); + rc = ghes_estatus_pool_init(); if (rc) goto err_ioremap_exit; + rc = platform_driver_register(&ghes_platform_driver); + if (rc) + goto err_pool_exit; + rc = apei_osc_setup(); if (rc == 0 && osc_sb_apei_support_acked) pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit and WHEA _OSC.\n"); @@ -692,6 +862,8 @@ static int __init ghes_init(void) pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n"); return 0; +err_pool_exit: + ghes_estatus_pool_exit(); err_ioremap_exit: ghes_ioremap_exit(); err: @@ -701,6 +873,7 @@ err: static void __exit ghes_exit(void) { platform_driver_unregister(&ghes_platform_driver); + ghes_estatus_pool_exit(); ghes_ioremap_exit(); } -- cgit v1.2.3 From 152cef40a808d3034e383465b3f7d6783613e458 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:26 +0800 Subject: ACPI, APEI, GHES, Error records content based throttle printk is used by GHES to report hardware errors. Ratelimit is enforced on the printk to avoid too many hardware error reports in kernel log. Because there may be thousands or even millions of corrected hardware errors during system running. Currently, a simple scheme is used. That is, the total number of hardware error reporting is ratelimited. This may cause some issues in practice. For example, there are two kinds of hardware errors occurred in system. One is corrected memory error, because the fault memory address is accessed frequently, there may be hundreds error report per-second. The other is corrected PCIe AER error, it will be reported once per-second. Because they share one ratelimit control structure, it is highly possible that only memory error is reported. To avoid the above issue, an error record content based throttle algorithm is implemented in the patch. Where after the first successful reporting, all error records that are same are throttled for some time, to let other kinds of error records have the opportunity to be reported. In above example, the memory errors will be throttled for some time, after being printked. Then the PCIe AER error will be printked successfully. Signed-off-by: Huang Ying Signed-off-by: Len Brown --- drivers/acpi/apei/ghes.c | 184 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 177 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index d1a40218e17..931410d31a9 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -60,6 +60,21 @@ #define GHES_ESTATUS_POOL_MIN_ALLOC_ORDER 3 +/* This is just an estimation for memory pool allocation */ +#define GHES_ESTATUS_CACHE_AVG_SIZE 512 + +#define GHES_ESTATUS_CACHES_SIZE 4 + +#define GHES_ESTATUS_IN_CACHE_MAX_NSEC (10 * NSEC_PER_SEC) +/* Prevent too many caches are allocated because of RCU */ +#define GHES_ESTATUS_CACHE_ALLOCED_MAX (GHES_ESTATUS_CACHES_SIZE * 3 / 2) + +#define GHES_ESTATUS_CACHE_LEN(estatus_len) \ + (sizeof(struct ghes_estatus_cache) + (estatus_len)) +#define GHES_ESTATUS_FROM_CACHE(estatus_cache) \ + ((struct acpi_hest_generic_status *) \ + ((struct ghes_estatus_cache *)(estatus_cache) + 1)) + #define GHES_ESTATUS_NODE_LEN(estatus_len) \ (sizeof(struct ghes_estatus_node) + (estatus_len)) #define GHES_ESTATUS_FROM_NODE(estatus_node) \ @@ -94,6 +109,14 @@ struct ghes_estatus_node { struct acpi_hest_generic *generic; }; +struct ghes_estatus_cache { + u32 estatus_len; + atomic_t count; + struct acpi_hest_generic *generic; + unsigned long long time_in; + struct rcu_head rcu; +}; + int ghes_disable; module_param_named(disable, ghes_disable, bool, 0); @@ -154,6 +177,9 @@ static unsigned long ghes_estatus_pool_size_request; static struct llist_head ghes_estatus_llist; static struct irq_work ghes_proc_irq_work; +struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE]; +static atomic_t ghes_estatus_cache_alloced; + static int ghes_ioremap_init(void) { ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES, @@ -458,9 +484,9 @@ static void __ghes_print_estatus(const char *pfx, apei_estatus_print(pfx, estatus); } -static void ghes_print_estatus(const char *pfx, - const struct acpi_hest_generic *generic, - const struct acpi_hest_generic_status *estatus) +static int ghes_print_estatus(const char *pfx, + const struct acpi_hest_generic *generic, + const struct acpi_hest_generic_status *estatus) { /* Not more than 2 messages every 5 seconds */ static DEFINE_RATELIMIT_STATE(ratelimit_corrected, 5*HZ, 2); @@ -471,8 +497,137 @@ static void ghes_print_estatus(const char *pfx, ratelimit = &ratelimit_corrected; else ratelimit = &ratelimit_uncorrected; - if (__ratelimit(ratelimit)) + if (__ratelimit(ratelimit)) { __ghes_print_estatus(pfx, generic, estatus); + return 1; + } + return 0; +} + +/* + * GHES error status reporting throttle, to report more kinds of + * errors, instead of just most frequently occurred errors. + */ +static int ghes_estatus_cached(struct acpi_hest_generic_status *estatus) +{ + u32 len; + int i, cached = 0; + unsigned long long now; + struct ghes_estatus_cache *cache; + struct acpi_hest_generic_status *cache_estatus; + + len = apei_estatus_len(estatus); + rcu_read_lock(); + for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) { + cache = rcu_dereference(ghes_estatus_caches[i]); + if (cache == NULL) + continue; + if (len != cache->estatus_len) + continue; + cache_estatus = GHES_ESTATUS_FROM_CACHE(cache); + if (memcmp(estatus, cache_estatus, len)) + continue; + atomic_inc(&cache->count); + now = sched_clock(); + if (now - cache->time_in < GHES_ESTATUS_IN_CACHE_MAX_NSEC) + cached = 1; + break; + } + rcu_read_unlock(); + return cached; +} + +static struct ghes_estatus_cache *ghes_estatus_cache_alloc( + struct acpi_hest_generic *generic, + struct acpi_hest_generic_status *estatus) +{ + int alloced; + u32 len, cache_len; + struct ghes_estatus_cache *cache; + struct acpi_hest_generic_status *cache_estatus; + + alloced = atomic_add_return(1, &ghes_estatus_cache_alloced); + if (alloced > GHES_ESTATUS_CACHE_ALLOCED_MAX) { + atomic_dec(&ghes_estatus_cache_alloced); + return NULL; + } + len = apei_estatus_len(estatus); + cache_len = GHES_ESTATUS_CACHE_LEN(len); + cache = (void *)gen_pool_alloc(ghes_estatus_pool, cache_len); + if (!cache) { + atomic_dec(&ghes_estatus_cache_alloced); + return NULL; + } + cache_estatus = GHES_ESTATUS_FROM_CACHE(cache); + memcpy(cache_estatus, estatus, len); + cache->estatus_len = len; + atomic_set(&cache->count, 0); + cache->generic = generic; + cache->time_in = sched_clock(); + return cache; +} + +static void ghes_estatus_cache_free(struct ghes_estatus_cache *cache) +{ + u32 len; + + len = apei_estatus_len(GHES_ESTATUS_FROM_CACHE(cache)); + len = GHES_ESTATUS_CACHE_LEN(len); + gen_pool_free(ghes_estatus_pool, (unsigned long)cache, len); + atomic_dec(&ghes_estatus_cache_alloced); +} + +static void ghes_estatus_cache_rcu_free(struct rcu_head *head) +{ + struct ghes_estatus_cache *cache; + + cache = container_of(head, struct ghes_estatus_cache, rcu); + ghes_estatus_cache_free(cache); +} + +static void ghes_estatus_cache_add( + struct acpi_hest_generic *generic, + struct acpi_hest_generic_status *estatus) +{ + int i, slot = -1, count; + unsigned long long now, duration, period, max_period = 0; + struct ghes_estatus_cache *cache, *slot_cache = NULL, *new_cache; + + new_cache = ghes_estatus_cache_alloc(generic, estatus); + if (new_cache == NULL) + return; + rcu_read_lock(); + now = sched_clock(); + for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) { + cache = rcu_dereference(ghes_estatus_caches[i]); + if (cache == NULL) { + slot = i; + slot_cache = NULL; + break; + } + duration = now - cache->time_in; + if (duration >= GHES_ESTATUS_IN_CACHE_MAX_NSEC) { + slot = i; + slot_cache = cache; + break; + } + count = atomic_read(&cache->count); + period = duration / (count + 1); + if (period > max_period) { + max_period = period; + slot = i; + slot_cache = cache; + } + } + /* new_cache must be put into array after its contents are written */ + smp_wmb(); + if (slot != -1 && cmpxchg(ghes_estatus_caches + slot, + slot_cache, new_cache) == slot_cache) { + if (slot_cache) + call_rcu(&slot_cache->rcu, ghes_estatus_cache_rcu_free); + } else + ghes_estatus_cache_free(new_cache); + rcu_read_unlock(); } static int ghes_proc(struct ghes *ghes) @@ -482,9 +637,11 @@ static int ghes_proc(struct ghes *ghes) rc = ghes_read_estatus(ghes, 0); if (rc) goto out; - ghes_print_estatus(NULL, ghes->generic, ghes->estatus); + if (!ghes_estatus_cached(ghes->estatus)) { + if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus)) + ghes_estatus_cache_add(ghes->generic, ghes->estatus); + } ghes_do_proc(ghes->estatus); - out: ghes_clear_estatus(ghes); return 0; @@ -546,6 +703,7 @@ static void ghes_proc_in_irq(struct irq_work *irq_work) { struct llist_node *llnode, *next, *tail = NULL; struct ghes_estatus_node *estatus_node; + struct acpi_hest_generic *generic; struct acpi_hest_generic_status *estatus; u32 len, node_len; @@ -569,7 +727,11 @@ static void ghes_proc_in_irq(struct irq_work *irq_work) len = apei_estatus_len(estatus); node_len = GHES_ESTATUS_NODE_LEN(len); ghes_do_proc(estatus); - ghes_print_estatus(NULL, estatus_node->generic, estatus); + if (!ghes_estatus_cached(estatus)) { + generic = estatus_node->generic; + if (ghes_print_estatus(NULL, generic, estatus)) + ghes_estatus_cache_add(generic, estatus); + } gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len); llnode = next; @@ -622,6 +784,8 @@ static int ghes_notify_nmi(struct notifier_block *this, if (!(ghes->flags & GHES_TO_CLEAR)) continue; #ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG + if (ghes_estatus_cached(ghes->estatus)) + goto next; /* Save estatus for further processing in IRQ context */ len = apei_estatus_len(ghes->estatus); node_len = GHES_ESTATUS_NODE_LEN(len); @@ -633,6 +797,7 @@ static int ghes_notify_nmi(struct notifier_block *this, memcpy(estatus, ghes->estatus, len); llist_add(&estatus_node->llnode, &ghes_estatus_llist); } +next: #endif ghes_clear_estatus(ghes); } @@ -847,6 +1012,11 @@ static int __init ghes_init(void) if (rc) goto err_ioremap_exit; + rc = ghes_estatus_pool_expand(GHES_ESTATUS_CACHE_AVG_SIZE * + GHES_ESTATUS_CACHE_ALLOCED_MAX); + if (rc) + goto err_pool_exit; + rc = platform_driver_register(&ghes_platform_driver); if (rc) goto err_pool_exit; -- cgit v1.2.3 From ea8f5fb8a71fddaf5f3a17100d3247855701f732 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:27 +0800 Subject: HWPoison: add memory_failure_queue() memory_failure() is the entry point for HWPoison memory error recovery. It must be called in process context. But commonly hardware memory errors are notified via MCE or NMI, so some delayed execution mechanism must be used. In MCE handler, a work queue + ring buffer mechanism is used. In addition to MCE, now APEI (ACPI Platform Error Interface) GHES (Generic Hardware Error Source) can be used to report memory errors too. To add support to APEI GHES memory recovery, a mechanism similar to that of MCE is implemented. memory_failure_queue() is the new entry point that can be called in IRQ context. The next step is to make MCE handler uses this interface too. Signed-off-by: Huang Ying Cc: Andi Kleen Cc: Wu Fengguang Cc: Andrew Morton Signed-off-by: Len Brown --- include/linux/mm.h | 1 + mm/memory-failure.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9670f71d7be..303108eed28 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1633,6 +1633,7 @@ enum mf_flags { }; extern void memory_failure(unsigned long pfn, int trapno); extern int __memory_failure(unsigned long pfn, int trapno, int flags); +extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 740c4f52059..2b43ba051ac 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -1178,6 +1179,97 @@ void memory_failure(unsigned long pfn, int trapno) __memory_failure(pfn, trapno, 0); } +#define MEMORY_FAILURE_FIFO_ORDER 4 +#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) + +struct memory_failure_entry { + unsigned long pfn; + int trapno; + int flags; +}; + +struct memory_failure_cpu { + DECLARE_KFIFO(fifo, struct memory_failure_entry, + MEMORY_FAILURE_FIFO_SIZE); + spinlock_t lock; + struct work_struct work; +}; + +static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); + +/** + * memory_failure_queue - Schedule handling memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * @flags: Flags for memory failure handling + * + * This function is called by the low level hardware error handler + * when it detects hardware memory corruption of a page. It schedules + * the recovering of error page, including dropping pages, killing + * processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Can run in IRQ context. + */ +void memory_failure_queue(unsigned long pfn, int trapno, int flags) +{ + struct memory_failure_cpu *mf_cpu; + unsigned long proc_flags; + struct memory_failure_entry entry = { + .pfn = pfn, + .trapno = trapno, + .flags = flags, + }; + + mf_cpu = &get_cpu_var(memory_failure_cpu); + spin_lock_irqsave(&mf_cpu->lock, proc_flags); + if (kfifo_put(&mf_cpu->fifo, &entry)) + schedule_work_on(smp_processor_id(), &mf_cpu->work); + else + pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n", + pfn); + spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + put_cpu_var(memory_failure_cpu); +} +EXPORT_SYMBOL_GPL(memory_failure_queue); + +static void memory_failure_work_func(struct work_struct *work) +{ + struct memory_failure_cpu *mf_cpu; + struct memory_failure_entry entry = { 0, }; + unsigned long proc_flags; + int gotten; + + mf_cpu = &__get_cpu_var(memory_failure_cpu); + for (;;) { + spin_lock_irqsave(&mf_cpu->lock, proc_flags); + gotten = kfifo_get(&mf_cpu->fifo, &entry); + spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + if (!gotten) + break; + __memory_failure(entry.pfn, entry.trapno, entry.flags); + } +} + +static int __init memory_failure_init(void) +{ + struct memory_failure_cpu *mf_cpu; + int cpu; + + for_each_possible_cpu(cpu) { + mf_cpu = &per_cpu(memory_failure_cpu, cpu); + spin_lock_init(&mf_cpu->lock); + INIT_KFIFO(mf_cpu->fifo); + INIT_WORK(&mf_cpu->work, memory_failure_work_func); + } + + return 0; +} +core_initcall(memory_failure_init); + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page -- cgit v1.2.3 From ba61ca4aab47441f1c6cec28a9a6aa0489fd1df3 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:28 +0800 Subject: ACPI, APEI, GHES: Add hardware memory error recovery support memory_failure_queue() is called when recoverable memory errors are notified by firmware to do the recovery work. Signed-off-by: Huang Ying Signed-off-by: Len Brown --- drivers/acpi/apei/Kconfig | 7 +++++++ drivers/acpi/apei/ghes.c | 24 +++++++++++++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig index 35596eaaca1..c34aa51af4e 100644 --- a/drivers/acpi/apei/Kconfig +++ b/drivers/acpi/apei/Kconfig @@ -32,6 +32,13 @@ config ACPI_APEI_PCIEAER PCIe AER errors may be reported via APEI firmware first mode. Turn on this option to enable the corresponding support. +config ACPI_APEI_MEMORY_FAILURE + bool "APEI memory error recovering support" + depends on ACPI_APEI && MEMORY_FAILURE + help + Memory errors may be reported via APEI firmware first mode. + Turn on this option to enable the memory recovering support. + config ACPI_APEI_EINJ tristate "APEI Error INJection (EINJ)" depends on ACPI_APEI && DEBUG_FS diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 931410d31a9..e92c47c46f9 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -451,20 +451,30 @@ static void ghes_clear_estatus(struct ghes *ghes) static void ghes_do_proc(const struct acpi_hest_generic_status *estatus) { - int sev, processed = 0; + int sev, sec_sev; struct acpi_hest_generic_data *gdata; sev = ghes_severity(estatus->error_severity); apei_estatus_for_each_section(estatus, gdata) { -#ifdef CONFIG_X86_MCE + sec_sev = ghes_severity(gdata->error_severity); if (!uuid_le_cmp(*(uuid_le *)gdata->section_type, CPER_SEC_PLATFORM_MEM)) { - apei_mce_report_mem_error( - sev == GHES_SEV_CORRECTED, - (struct cper_sec_mem_err *)(gdata+1)); - processed = 1; - } + struct cper_sec_mem_err *mem_err; + mem_err = (struct cper_sec_mem_err *)(gdata+1); +#ifdef CONFIG_X86_MCE + apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED, + mem_err); #endif +#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE + if (sev == GHES_SEV_RECOVERABLE && + sec_sev == GHES_SEV_RECOVERABLE && + mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) { + unsigned long pfn; + pfn = mem_err->physical_addr >> PAGE_SHIFT; + memory_failure_queue(pfn, 0, 0); + } +#endif + } } } -- cgit v1.2.3 From a7e09d450b2e0b068e850d103b6ee1af537d1910 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 16 Jul 2011 18:14:21 -0400 Subject: ACPI: APEI build fix as GHES is optional... When # CONFIG_ACPI_APEI_GHES is not set: (.init.text+0x4c22): undefined reference to `ghes_disable' Reported-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: Len Brown --- drivers/acpi/bus.c | 2 -- include/acpi/apei.h | 4 ++++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 43ce3b0c492..437ddbf0c49 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -544,10 +544,8 @@ static void acpi_bus_osc_support(void) capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_PPC_OST_SUPPORT; #endif -#ifdef CONFIG_ACPI_APEI_GHES if (!ghes_disable) capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_APEI_SUPPORT; -#endif if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))) return; if (ACPI_SUCCESS(acpi_run_osc(handle, &context))) { diff --git a/include/acpi/apei.h b/include/acpi/apei.h index d40bc5521fc..51a527d24a8 100644 --- a/include/acpi/apei.h +++ b/include/acpi/apei.h @@ -18,7 +18,11 @@ extern int hest_disable; extern int erst_disable; +#ifdef CONFIG_ACPI_APEI_GHES extern int ghes_disable; +#else +#define ghes_disable 1 +#endif #ifdef CONFIG_ACPI_APEI void __init acpi_hest_init(void); -- cgit v1.2.3 From 70cb6e1da00db6c9212e6fd69bd96fd41c797077 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 2 Aug 2011 18:00:21 -0400 Subject: APEI GHES: 32-bit buildfix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/acpi/apei/ghes.c:542: warning: integer overflow in expression drivers/acpi/apei/ghes.c:619: warning: integer overflow in expression ghes.c:(.text+0x46289): undefined reference to `__udivdi3'   in function ghes_estatus_cache_add(). Reported-by: Randy Dunlap Signed-off-by: Len Brown --- drivers/acpi/apei/ghes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index e92c47c46f9..0784f99a466 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -65,7 +65,7 @@ #define GHES_ESTATUS_CACHES_SIZE 4 -#define GHES_ESTATUS_IN_CACHE_MAX_NSEC (10 * NSEC_PER_SEC) +#define GHES_ESTATUS_IN_CACHE_MAX_NSEC 10000000000ULL /* Prevent too many caches are allocated because of RCU */ #define GHES_ESTATUS_CACHE_ALLOCED_MAX (GHES_ESTATUS_CACHES_SIZE * 3 / 2) @@ -622,7 +622,8 @@ static void ghes_estatus_cache_add( break; } count = atomic_read(&cache->count); - period = duration / (count + 1); + period = duration; + do_div(period, (count + 1)); if (period > max_period) { max_period = period; slot = i; -- cgit v1.2.3 From c3e6088e1036f8084bc7444b38437da136b7588b Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 20 Jul 2011 16:09:29 +0800 Subject: ACPI, APEI, EINJ Param support is disabled by default EINJ parameter support is only usable for some specific BIOS. Originally, it is expected to have no harm for BIOS does not support it. But now, we found it will cause issue (memory overwriting) for some BIOS. So param support is disabled by default and only enabled when newly added module parameter named "param_extension" is explicitly specified. Signed-off-by: Huang Ying Cc: Matthew Garrett Acked-by: Don Zickus Acked-by: Tony Luck Signed-off-by: Len Brown --- Documentation/acpi/apei/einj.txt | 11 +++++++++-- drivers/acpi/apei/einj.c | 39 ++++++++++++++++++++++++--------------- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/Documentation/acpi/apei/einj.txt b/Documentation/acpi/apei/einj.txt index dfab71848dc..5cc699ba545 100644 --- a/Documentation/acpi/apei/einj.txt +++ b/Documentation/acpi/apei/einj.txt @@ -48,12 +48,19 @@ directory apei/einj. The following files are provided. - param1 This file is used to set the first error parameter value. Effect of parameter depends on error_type specified. For memory error, this is - physical memory address. + physical memory address. Only available if param_extension module + parameter is specified. - param2 This file is used to set the second error parameter value. Effect of parameter depends on error_type specified. For memory error, this is - physical memory address mask. + physical memory address mask. Only available if param_extension + module parameter is specified. + +Injecting parameter support is a BIOS version specific extension, that +is, it only works on some BIOS version. If you want to use it, please +make sure your BIOS version has the proper support and specify +"param_extension=y" in module parameter. For more information about EINJ, please refer to ACPI specification version 4.0, section 17.5. diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c index ab821f1cfa1..589b96c3870 100644 --- a/drivers/acpi/apei/einj.c +++ b/drivers/acpi/apei/einj.c @@ -46,7 +46,8 @@ * Some BIOSes allow parameters to the SET_ERROR_TYPE entries in the * EINJ table through an unpublished extension. Use with caution as * most will ignore the parameter and make their own choice of address - * for error injection. + * for error injection. This extension is used only if + * param_extension module parameter is specified. */ struct einj_parameter { u64 type; @@ -65,6 +66,9 @@ struct einj_parameter { ((struct acpi_whea_header *)((char *)(tab) + \ sizeof(struct acpi_table_einj))) +static bool param_extension; +module_param(param_extension, bool, 0); + static struct acpi_table_einj *einj_tab; static struct apei_resources einj_resources; @@ -489,14 +493,6 @@ static int __init einj_init(void) einj_debug_dir, NULL, &error_type_fops); if (!fentry) goto err_cleanup; - fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR, - einj_debug_dir, &error_param1); - if (!fentry) - goto err_cleanup; - fentry = debugfs_create_x64("param2", S_IRUSR | S_IWUSR, - einj_debug_dir, &error_param2); - if (!fentry) - goto err_cleanup; fentry = debugfs_create_file("error_inject", S_IWUSR, einj_debug_dir, NULL, &error_inject_fops); if (!fentry) @@ -513,12 +509,23 @@ static int __init einj_init(void) rc = apei_exec_pre_map_gars(&ctx); if (rc) goto err_release; - param_paddr = einj_get_parameter_address(); - if (param_paddr) { - einj_param = ioremap(param_paddr, sizeof(*einj_param)); - rc = -ENOMEM; - if (!einj_param) - goto err_unmap; + if (param_extension) { + param_paddr = einj_get_parameter_address(); + if (param_paddr) { + einj_param = ioremap(param_paddr, sizeof(*einj_param)); + rc = -ENOMEM; + if (!einj_param) + goto err_unmap; + fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR, + einj_debug_dir, &error_param1); + if (!fentry) + goto err_unmap; + fentry = debugfs_create_x64("param2", S_IRUSR | S_IWUSR, + einj_debug_dir, &error_param2); + if (!fentry) + goto err_unmap; + } else + pr_warn(EINJ_PFX "Parameter extension is not supported.\n"); } pr_info(EINJ_PFX "Error INJection is initialized.\n"); @@ -526,6 +533,8 @@ static int __init einj_init(void) return 0; err_unmap: + if (einj_param) + iounmap(einj_param); apei_exec_post_unmap_gars(&ctx); err_release: apei_resources_release(&einj_resources); -- cgit v1.2.3