Linux Kernel WARN()&BUG(), Oops&Panic, Tainted分析
稳定性范畴, 参考5.x kernel。
kernel Oops
Oops指的就是内核的不正确行为,比如对驱动来说:
static int i82092aa_pci_probe(struct pci_dev *dev,
const struct pci_device_id *id)
{
unsigned char configbyte;
int i, ret;
ret = pci_enable_device(dev);
if (ret)
return ret;
/* PCI Configuration Control */
pci_read_config_byte(dev, 0x40, &configbyte);
switch (configbyte&6) {
case 0:
socket_count = 2;
break;
case 2:
socket_count = 1;
break;
case 4:
case 6:
socket_count = 4;
break;
default:
dev_err(&dev->dev,
"Oops, you did something we didn't think of.\n");
ret = -EIO;
goto err_out_disable;
}
这里的PCI配置读出来有异常,我们就认为它是一个Oops,打印一条错误信息,探测失败。
分配内存失败也算一种Oops,只不过不需要打出错误信息。
td = kmalloc (sizeof (struct FS_BPENTRY), GFP_ATOMIC);
fs_dprintk (FS_DEBUG_ALLOC, "Alloc transd: %p(%zd)\n", td, sizeof (struct FS_BPENTRY));
if (!td) {
/* Oops out of mem */
return -ENOMEM;
}
在体系架构方面的Oops,比如arm64的bug Oops:
static int bug_handler(struct pt_regs *regs, unsigned int esr)
{
switch (report_bug(regs->pc, regs)) {
case BUG_TRAP_TYPE_BUG:
die("Oops - BUG", regs, 0);
break;
如果report_bug()
返回的是BUG_TRAP_TYPE_BUG
,那就报个Oops log。
再比如非法访问也会走die("Oops", )
:
/*
 * Report a kernel-mode fault that cannot be handled.
 * @msg:  fault description inserted into the "Unable to handle kernel ..." line
 * @addr: faulting virtual address; printed and passed to show_pte()
 * @esr:  value decoded by mem_abort_decode() and forwarded to die() as its
 *        error code
 * @regs: register state forwarded to die()
 *
 * Ends by calling do_exit(SIGKILL) if die() returns.
 */
static void die_kernel_fault(const char *msg, unsigned long addr,
unsigned int esr, struct pt_regs *regs)
{
bust_spinlocks(1);
pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
addr);
mem_abort_decode(esr);
show_pte(addr);
/* Emit the "Oops" report; any escalation to panic() is decided inside die(). */
die("Oops", regs, esr); //tj
bust_spinlocks(0);
do_exit(SIGKILL);
}
看下die()
:
/*
 * Print an oops report and decide how execution continues.
 * @str:  short description handed to __die() (callers in this file pass
 *        "Oops" or "Oops - BUG")
 * @regs: register state at the point of failure
 * @err:  error code forwarded to __die()
 *
 * Calls panic() when in interrupt context or when panic_on_oops is set.
 * Otherwise calls do_exit(SIGSEGV), unless __die() returned NOTIFY_STOP,
 * in which case the function simply returns.
 */
void die(const char *str, struct pt_regs *regs, int err)
{
int ret;
unsigned long flags;
/* die_lock is held from here until just before the final do_exit() check. */
raw_spin_lock_irqsave(&die_lock, flags);
oops_enter();
console_verbose();
bust_spinlocks(1);
ret = __die(str, err, regs); //tj
if (regs && kexec_should_crash(current))
crash_kexec(regs);
bust_spinlocks(0);
/* Record that this kernel has gone through die(). */
add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
oops_exit();
/* Both panic() calls below are made with die_lock still held. */
if (in_interrupt())
panic("Fatal exception in interrupt"); //tj
if (panic_on_oops)
panic("Fatal exception"); //tj
raw_spin_unlock_irqrestore(&die_lock, flags);
if (ret != NOTIFY_STOP)
do_exit(SIGSEGV);
}
再看下__die()
:
/*
 * Print the "Internal error" banner followed by the module list, registers
 * and instruction dump.
 * @str:  description placed in the banner
 * @err:  error code, printed in hex in the banner
 * @regs: register state passed to the notifier chain and the dump helpers
 *
 * Returns the notify_die() result. When that result is NOTIFY_STOP the
 * module list, register dump and instruction dump are skipped.
 */
static int __die(const char *str, int err, struct pt_regs *regs)
{
/* Persists across calls; printed as the "[#N]" counter in the banner. */
static int die_counter;
int ret;
pr_emerg("Internal error: %s: %x [#%d]" S_PREEMPT S_SMP "\n",
str, err, ++die_counter);
/* trap and error numbers are mostly meaningless on ARM */
ret = notify_die(DIE_OOPS, str, regs, err, 0, SIGSEGV);
if (ret == NOTIFY_STOP)
return ret;
print_modules();
show_regs(regs);
dump_kernel_instr(KERN_EMERG, regs);
return ret;
}
打印类似如下log:
35.449887: <6> Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
35.449893: <6> Modules linked in:
35.449901: <6> Process init (pid: 1, stack limit = 0x00000000826895f7)
后面会call panic()
,不过是有条件的:
if (in_interrupt())
panic("Fatal exception in interrupt");
if (panic_on_oops)
panic("Fatal exception");
如果这个Oops在中断里,会走panic()
。如果不在但if (panic_on_oops)
成立,也走panic()
。
可见,Oops不一定会导致panic。bug_handler()
对BUG_TRAP_TYPE_BUG
还不默认panic?
btw: arm64的Oops是怎么触发的呢?稍后看。
Kernel panic
kernel panic就是不可恢复的错误了,怎么处理?要么复位重启,要么就停在这里不动。
/**
* panic - halt the system
* @fmt: The text string to print
*
* Display a message, then perform cleanups.
*
* This function never returns.
*/
void panic(const char *fmt, ...)
{
...
pr_emerg("Kernel panic - not syncing: %s\n", buf);
...
if (panic_timeout > 0) { //tj: 延迟重启
/*
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
pr_emerg("Rebooting in %d seconds..\n", panic_timeout); //tj
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
if (i >= i_next) {
i += panic_blink(state ^= 1);
i_next = i + 3600 / PANIC_BLINK_SPD;
}
mdelay(PANIC_TIMER_STEP);
}
}
if (panic_timeout != 0) { //tj: 立即reboot
/*
* This will not be a clean reboot, with everything
* shutting down. But if there is a chance of
* rebooting the system it will be rebooted.
*/
if (panic_reboot_mode != REBOOT_UNDEFINED)
reboot_mode = panic_reboot_mode;
emergency_restart();
}
...
pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); //tj: 一直卡这
/* Do not scroll important messages printed above */
suppress_printk = 1;
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
if (i >= i_next) {
i += panic_blink(state ^= 1);
i_next = i + 3600 / PANIC_BLINK_SPD;
}
mdelay(PANIC_TIMER_STEP);
}
config PANIC_TIMEOUT
int "panic timeout"
default 0
help
Set the timeout value (in seconds) until a reboot occurs when the
kernel panics. If n = 0, then we wait forever. A timeout
value n > 0 will wait n seconds before rebooting, while a timeout
value n < 0 will reboot immediately.
这个panic timeout在Kconfig里说的很清楚。
BUG() ifndef HAVE_ARCH_BUG
先看代码注释:
/*
 * Don't use BUG() or BUG_ON() unless there's really no way out; one
 * example might be detecting data structure corruption in the middle
 * of an operation that can't be backed out of. If the (sub)system
 * can somehow continue operating, perhaps with reduced functionality,
 * it's probably not BUG-worthy.
 *
 * If you're tempted to BUG(), think again: is completely giving up
 * really the *only* solution? There are usually better options, where
 * users don't need to reboot ASAP and can mostly shut down cleanly.
 */
/*
 * Generic fallback, compiled only when HAVE_ARCH_BUG is not defined:
 * log the file, line and function, then call panic() unconditionally.
 */
#ifndef HAVE_ARCH_BUG
#define BUG() do { \
printk("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \
barrier_before_unreachable(); \
panic("BUG!"); \
} while (0)
#endif
无路可走了?那就call me。如果只是丢失功能,系统还能继续跑,那就不算BUG。我就call你debug不行啊:)
这里有个宏HAVE_ARCH_BUG
,没有实现arch bug,那就用它了,会直接导致panic,这是bug嘛,当然panic,感觉哪里不对劲?
BUG() on arm64
看下arm64的实现:
/* Emit the ASM_BUG_FLAGS(flags) sequence as inline assembly. */
#define __BUG_FLAGS(flags) \
asm volatile (__stringify(ASM_BUG_FLAGS(flags)));
/* BUG(): trap with no flags set; the code after the trap is marked unreachable. */
#define BUG() do { \
__BUG_FLAGS(0); \
unreachable(); \
} while (0)
/* WARN variant: the same trap sequence with BUGFLAG_WARNING OR-ed into the flags. */
#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags))
/* Defined before the generic header so its "#ifndef HAVE_ARCH_BUG" fallback BUG() is skipped. */
#define HAVE_ARCH_BUG //tj: define
#include <asm-generic/bug.h>
/*
 * __BUG_ENTRY(flags) emits one record into the __bug_table section for the
 * trap instruction that follows it. Without CONFIG_GENERIC_BUG it expands
 * to nothing, so no table record is produced.
 */
#ifdef CONFIG_GENERIC_BUG
#define __BUG_ENTRY(flags) \
.pushsection __bug_table,"aw"; \
.align 2; \
14470: .long 14471f - 14470b; \
_BUGVERBOSE_LOCATION(__FILE__, __LINE__) \
.short flags; \
.popsection; \
14471:
#else
#define __BUG_ENTRY(flags)
#endif
/*
 * Full trap sequence: the optional bug-table record followed by the BRK
 * instruction. Defined outside the #ifdef so it exists in both configurations.
 */
#define ASM_BUG_FLAGS(flags) \
__BUG_ENTRY(flags) \
brk BUG_BRK_IMM
/*
* #imm16 values used for BRK instruction generation
* ...
* 0x800: kernel-mode BUG() and WARN() traps
* ...
*/
#define BUG_BRK_IMM 0x800
arm64的BUG()
就是抛个brk 0x800
指令,注释也写明了。
bug_handler()
就是对应这个的处理。
/*
 * Break hook for BRK #BUG_BRK_IMM, i.e. the BUG() and WARN() traps.
 * @regs: register state at the trap; regs->pc is handed to report_bug()
 * @esr:  not used by this function
 *
 * Returns DBG_HOOK_HANDLED for BUG and WARN traps, and DBG_HOOK_ERROR for
 * any other report_bug() result.
 */
static int bug_handler(struct pt_regs *regs, unsigned int esr)
{
switch (report_bug(regs->pc, regs)) { //tj: report_bug()
case BUG_TRAP_TYPE_BUG:
/* May not return: die() can panic() or do_exit(). */
die("Oops - BUG", regs, 0);
break;
case BUG_TRAP_TYPE_WARN:
/* A warning was already reported by report_bug(); just resume. */
break;
default:
/* unknown/unrecognised bug trap type */
return DBG_HOOK_ERROR;
}
/* If thread survives, skip over the BUG instruction and continue: */
arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
return DBG_HOOK_HANDLED;
}
/* Break hook descriptor pairing the BUG_BRK_IMM immediate with bug_handler(). */
static struct break_hook bug_break_hook = {
.fn = bug_handler,
.imm = BUG_BRK_IMM,
};
report bug:
enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
{
...
if (!is_valid_bugaddr(bugaddr))
return BUG_TRAP_TYPE_NONE;
bug = find_bug(bugaddr);
if (!bug)
return BUG_TRAP_TYPE_NONE;
...
if (file)
pr_crit("kernel BUG at %s:%u!\n", file, line);
else
pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n",
(void *)bugaddr);
return BUG_TRAP_TYPE_BUG;
}
/*
 * Address check used by report_bug().
 * @addr: candidate bug address (ignored)
 *
 * Always returns 1; the comment inside explains why no real check is needed.
 */
int is_valid_bugaddr(unsigned long addr)
{
/*
 * bug_handler() only called for BRK #BUG_BRK_IMM.
 * So the answer is trivial -- any spurious instances with no
 * bug table entry will be rejected by report_bug() and passed
 * back to the debug-monitors code and handled as a fatal
 * unexpected debug exception.
 */
return 1;
}
如果是bug,log里会报出来pr_crit("kernel BUG
。再大概看下bug_handler()
的触发:
bug_handler
就是bug_break_hook.fn
:
/*
 * Register the kernel break hooks (BUG/WARN always, KASAN only with
 * CONFIG_KASAN_SW_TAGS), then install the debug exception handlers through
 * debug_traps_init().
 */
void __init trap_init(void)
{
register_kernel_break_hook(&bug_break_hook);
#ifdef CONFIG_KASAN_SW_TAGS
register_kernel_break_hook(&kasan_break_hook);
#endif
debug_traps_init();
}
先是register:
/* Hook list that call_break_hook() walks when the trap did not come from user mode. */
static LIST_HEAD(kernel_break_hook);
/* Add @hook to the kernel_break_hook list. */
void register_kernel_break_hook(struct break_hook *hook)
{
register_debug_hook(&hook->node, &kernel_break_hook);
}
/*
 * Insert @node into @list with list_add_rcu() while holding debug_hook_lock.
 * @node: list node embedded in the hook being registered
 * @list: hook list to add the node to
 */
static void register_debug_hook(struct list_head *node, struct list_head *list)
{
spin_lock(&debug_hook_lock);
list_add_rcu(node, list);
spin_unlock(&debug_hook_lock);
}
增加到list kernel_break_hook
。然后初始化:
/* Slot number passed to hook_debug_fault_code() for the BRK handler. */
#define DBG_ESR_EVT_BRK 0x6
/*
 * Install single_step_handler and brk_handler as the handlers for the
 * DBG_ESR_EVT_HWSS and DBG_ESR_EVT_BRK slots, both reported as SIGTRAP.
 */
void __init debug_traps_init(void)
{
hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP,
TRAP_TRACE, "single-step handler");
hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP,
TRAP_BRKPT, "ptrace BRK handler");
}
/*
 * Overwrite one entry of debug_fault_info[].
 * @nr:   index of the entry to replace; out-of-range values trigger BUG_ON()
 * @fn:   new handler
 * @sig:  signal number stored in the entry
 * @code: signal code stored in the entry
 * @name: description stored in the entry
 */
void __init hook_debug_fault_code(int nr,
int (*fn)(unsigned long, unsigned int, struct pt_regs *),
int sig, int code, const char *name)
{
BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));
debug_fault_info[nr].fn = fn; //tj: brk_handler
debug_fault_info[nr].sig = sig;
debug_fault_info[nr].code = code;
debug_fault_info[nr].name = name;
}
/*
 * Initial debug fault table: { handler, signal, signal code, name } per slot.
 * Every slot starts out as do_bad except index 6, which starts as
 * early_brk64. hook_debug_fault_code() replaces individual entries.
 */
static struct fault_info __refdata debug_fault_info[] = {
{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware breakpoint" },
{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware single-step" },
{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware watchpoint" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 3" },
{ do_bad, SIGTRAP, TRAP_BRKPT, "aarch32 BKPT" },
{ do_bad, SIGKILL, SI_KERNEL, "aarch32 vector catch" },
{ early_brk64, SIGTRAP, TRAP_BRKPT, "aarch64 BRK" }, //tj: here, index=6
{ do_bad, SIGKILL, SI_KERNEL, "unknown 7" },
};
就是这里的early_brk64
被替换成了brk_handler
。
默认的early_brk64()
直接call bug_handler()
:
/*
 * Initial handler for AArch64 BRK exceptions
 * This handler only used until debug_traps_init().
 *
 * @addr: not used by this function
 * @esr:  syndrome value; with CONFIG_KASAN_SW_TAGS its BRK comment field
 *        selects between kasan_handler() and bug_handler()
 * @regs: register state forwarded to the chosen handler
 *
 * Returns 0 when the chosen handler reports DBG_HOOK_HANDLED, 1 otherwise.
 */
int __init early_brk64(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
#ifdef CONFIG_KASAN_SW_TAGS
unsigned int comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK;
if ((comment & ~KASAN_BRK_MASK) == KASAN_BRK_IMM)
return kasan_handler(regs, esr) != DBG_HOOK_HANDLED;
#endif
return bug_handler(regs, esr) != DBG_HOOK_HANDLED;
}
而brk_handler()
会走hook:
/*
 * BRK exception handler installed by debug_traps_init().
 * @unused: ignored
 * @esr:    syndrome value forwarded to call_break_hook()
 * @regs:   register state forwarded to call_break_hook() and user_mode()
 *
 * Returns 0 when a registered break hook handled the trap, or when the trap
 * came from user mode (send_user_sigtrap(TRAP_BRKPT) is called in that
 * case). Returns -EFAULT for an unhandled BRK from kernel mode.
 */
static int brk_handler(unsigned long unused, unsigned int esr,
struct pt_regs *regs)
{
if (call_break_hook(regs, esr) == DBG_HOOK_HANDLED)
return 0;
if (user_mode(regs)) {
send_user_sigtrap(TRAP_BRKPT);
} else {
pr_warn("Unexpected kernel BRK exception at EL1\n");
return -EFAULT;
}
return 0;
}
call_break_hook()
:
/*
 * Find and invoke the break hook matching a BRK exception.
 * @regs: register state; user_mode(regs) selects the user or kernel hook list
 * @esr:  syndrome value whose BRK comment field is compared with each hook
 *
 * Returns the matching hook's return value, or DBG_HOOK_ERROR when no hook
 * on the selected list matches.
 */
static int call_break_hook(struct pt_regs *regs, unsigned int esr)
{
struct break_hook *hook;
struct list_head *list;
int (*fn)(struct pt_regs *regs, unsigned int esr) = NULL;
list = user_mode(regs) ? &user_break_hook : &kernel_break_hook;
/*
 * Since brk exception disables interrupt, this function is
 * entirely not preemptible, and we can use rcu list safely here.
 */
list_for_each_entry_rcu(hook, list, node) {
unsigned int comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK;
/* Bits set in hook->mask are ignored in the comparison. */
/* The loop does not break, so the last matching hook visited wins. */
if ((comment & ~hook->mask) == hook->imm)
fn = hook->fn;
}
return fn ? fn(regs, esr) : DBG_HOOK_ERROR;
}
就是在list里找到hook->fn
,也就是bug_handler()
。
那BUG_ON()是怎么进入panic()的?除了在中断上下文中,就是靠panic_on_oops
控制:
panic_on_oops
Controls the kernel's behaviour when an oops or BUG is encountered.
= ================================================
0 Try to continue operation.
1 Panic immediately. If the `panic`
sysctl is also non-zero then the
machine will be rebooted.
= ================================================
Android一般在init.rc开启:
on init
...
write /proc/sys/kernel/panic_on_oops 1
WARN()
先看注释:
/*
* WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report
* significant kernel issues that need prompt attention if they should ever
* appear at runtime.
*
* Do not use these macros when checking for invalid external inputs
* (e.g. invalid system call arguments, or invalid data coming from
* network/devices), and on transient conditions like ENOMEM or EAGAIN.
* These macros should be used for recoverable kernel issues only.
* For invalid external inputs, transient conditions, etc use
* pr_err[_once/_ratelimited]() followed by dump_stack(), if necessary.
* Do not include "BUG"/"WARNING" in format strings manually to make these
* conditions distinguishable from kernel issues.
*
* Use the versions with printk format strings to provide better diagnostics.
*/
WARN()系是用来报告可恢复的(recoverable)内核问题的,不适用于检查非法外部输入(如系统调用入参)、内存不足(ENOMEM)这类瞬态场景。
WARN()系有个区分__WARN_FLAGS
,与体系架构有关:
/*
 * Two implementations of __WARN()/__WARN_printf(), chosen by whether
 * __WARN_FLAGS has already been defined before this point.
 */
#ifndef __WARN_FLAGS
/* No __WARN_FLAGS: warnings are reported by calling warn_slowpath_fmt(). */
extern __printf(4, 5)
void warn_slowpath_fmt(const char *file, const int line, unsigned taint,
const char *fmt, ...);
#define __WARN() __WARN_printf(TAINT_WARN, NULL)
#define __WARN_printf(taint, arg...) do { \
instrumentation_begin(); \
warn_slowpath_fmt(__FILE__, __LINE__, taint, arg); \
instrumentation_end(); \
} while (0)
#else
/* __WARN_FLAGS available: print the message with __warn_printk(), then issue the flagged trap. */
extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
#define __WARN() __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN))
#define __WARN_printf(taint, arg...) do { \
instrumentation_begin(); \
__warn_printk(arg); \
__WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\
instrumentation_end(); \
} while (0)
/*
 * Evaluates condition exactly once. When it is true, issues the trap with
 * BUGFLAG_ONCE and TAINT_WARN. The expression's value is the truthiness of
 * condition.
 */
#define WARN_ON_ONCE(condition) ({ \
int __ret_warn_on = !!(condition); \
if (unlikely(__ret_warn_on)) \
__WARN_FLAGS(BUGFLAG_ONCE | \
BUGFLAG_TAINT(TAINT_WARN)); \
unlikely(__ret_warn_on); \
})
#endif
关注arm64:
#define __BUG_FLAGS(flags) \
asm volatile (__stringify(ASM_BUG_FLAGS(flags)));
#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags))
和arm64的BUG()
定义放一起的,BUGFLAG_WARNING
:
/*
 * Bit layout of a bug entry's flags field: bits 0-3 are the individual
 * flags below, and the taint value is stored shifted left by 8.
 */
#ifdef CONFIG_GENERIC_BUG
#define BUGFLAG_WARNING (1 << 0) //tj: here
#define BUGFLAG_ONCE (1 << 1)
#define BUGFLAG_DONE (1 << 2)
#define BUGFLAG_NO_CUT_HERE (1 << 3) /* CUT_HERE already sent */
#define BUGFLAG_TAINT(taint) ((taint) << 8) //tj
/* Inverse of BUGFLAG_TAINT(): recover the taint value from a bug entry. */
#define BUG_GET_TAINT(bug) ((bug)->flags >> 8)
#endif
#define __WARN() __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN))
主要差异体现在__BUG_ENTRY
,具体涉及arm64汇编,这里不关注。
__WARN_printf
就是多了个log。
还有个__warn()
函数,lib/bug.c在report_bug()
会用:
enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
{
...
if (bug) {
...
warning = (bug->flags & BUGFLAG_WARNING) != 0;
...
if ((bug->flags & BUGFLAG_NO_CUT_HERE) == 0)
printk(KERN_DEFAULT CUT_HERE);
if (warning) {
/* this is a WARN_ON rather than BUG/BUG_ON */
__warn(file, line, (void *)bugaddr, BUG_GET_TAINT(bug), regs,
NULL);
return BUG_TRAP_TYPE_WARN;
}
if (file)
pr_crit("kernel BUG at %s:%u!\n", file, line);
bug->flags
就是前面定义的哈,arm64 warn也触发的bug_handler()
。
__warn()
:
void __warn(const char *file, int line, void *caller, unsigned taint,
struct pt_regs *regs, struct warn_args *args)
{
disable_trace_on_warning();
if (file)
pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
raw_smp_processor_id(), current->pid, file, line,
caller);
else
pr_warn("WARNING: CPU: %d PID: %d at %pS\n",
raw_smp_processor_id(), current->pid, caller);
if (args)
vprintk(args->fmt, args->args);
if (panic_on_warn) { //tj: here
/*
* This thread may hit another WARN() in the panic path.
* Resetting this prevents additional WARN() from panicking the
* system on this thread. Other threads are blocked by the
* panic_mutex in panic().
*/
panic_on_warn = 0;
panic("panic_on_warn set ...\n");
}
print_modules();
if (regs)
show_regs(regs);
else
dump_stack();
print_irqtrace_events(current);
print_oops_end_marker();
/* Just a warning, don't kill lockdep. */
add_taint(taint, LOCKDEP_STILL_OK);
what? panic还能发生在warn上?没错,就是这个panic_on_warn
,看下缘由:
panic_on_warn
Calls panic() in the WARN() path when set to 1. This is useful to avoid
a kernel rebuild when attempting to kdump at the location of a WARN().
= ================================================
0 Only WARN(), default behaviour.
1 Call panic() after printing out WARN() location.
= ================================================
kdump用时不用rebuild,ok。
tainted-kernels
一些Oops log会看到Tainted
字样如下:
35.449908: <6> CPU: 0 PID: 1 Comm: init Tainted: G S W 4.14.117-perf+ #65
就是内核被污染了,查问题时用得上。即使污染源被去除后,污染状态一直保留。
运行时状态查询在:/proc/sys/kernel/tainted
, bug, oops, panics都会打印出来。
arm64的die()
中就会增加污染标记:
void die(const char *str, struct pt_regs *regs, int err)
{
...
bust_spinlocks(0);
add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); //tj
oops_exit();
/*
 * Set one taint bit in tainted_mask.
 * @flag:       bit number to set
 * @lockdep_ok: when LOCKDEP_NOW_UNRELIABLE, __debug_locks_off() is called
 *              and a notice is printed if it returns non-zero
 *
 * After setting the bit, calls panic() if any bit now set in tainted_mask is
 * also set in panic_on_taint. panic_on_taint is cleared before panicking.
 */
void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
{
if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
pr_warn("Disabling lock debugging due to kernel taint\n");
set_bit(flag, &tainted_mask); //tj
/* Tests the whole mask, not just @flag. */
if (tainted_mask & panic_on_taint) {
panic_on_taint = 0;
panic("panic_on_taint set ...");
}
}
污染标记到tainted_mask
里。
/* This cannot be an enum because some may be used in assembly source. */
#define TAINT_PROPRIETARY_MODULE 0
#define TAINT_FORCED_MODULE 1
#define TAINT_CPU_OUT_OF_SPEC 2
#define TAINT_FORCED_RMMOD 3
#define TAINT_MACHINE_CHECK 4
#define TAINT_BAD_PAGE 5
#define TAINT_USER 6
#define TAINT_DIE 7 //tj
#define TAINT_OVERRIDDEN_ACPI_TABLE 8
#define TAINT_WARN 9
#define TAINT_CRAP 10
#define TAINT_FIRMWARE_WORKAROUND 11
#define TAINT_OOT_MODULE 12
#define TAINT_UNSIGNED_MODULE 13
#define TAINT_SOFTLOCKUP 14
#define TAINT_LIVEPATCH 15
#define TAINT_AUX 16
#define TAINT_RANDSTRUCT 17
#define TAINT_FLAGS_COUNT 18
这么多污染种类。打印污染状态时:
/**
 * print_tainted - return a string to represent the kernel taint state.
 *
 * For individual taint flag meanings, see Documentation/admin-guide/sysctl/kernel.rst
 *
 * The string is overwritten by the next call to print_tainted(),
 * but is always NUL-terminated.
 *
 * Return: pointer to a static buffer holding either "Not tainted" or
 * "Tainted: " followed by one character per taint flag.
 */
const char *print_tainted(void)
{
/* One character per flag plus the prefix; sizeof() of the literal covers the NUL. */
static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")];
BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT);
if (tainted_mask) {
char *s;
int i;
s = buf + sprintf(buf, "Tainted: ");
/* Every flag contributes exactly one character: c_true if set, c_false if clear. */
for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
const struct taint_flag *t = &taint_flags[i];
*s++ = test_bit(i, &tainted_mask) ? //tj
t->c_true : t->c_false;
}
*s = 0;
} else
snprintf(buf, sizeof(buf), "Not tainted");
return buf;
}
从tainted_mask
里取出之前设置的标记via test_bit(i, &tainted_mask)
。像WARN()
就设置了TAINT_WARN
。
那都被污染了要不要panic?取决于panic_on_taint
:
panic_on_taint= Bitmask for conditionally calling panic() in add_taint() Format: <hex>[,nousertaint] Hexadecimal bitmask representing the set of TAINT flags that will cause the kernel to panic when add_taint() is called with any of the flags in this set. The optional switch "nousertaint" can be utilized to prevent userspace forced crashes by writing to sysctl /proc/sys/kernel/tainted any flagset matching with the bitmask set on panic_on_taint. See Documentation/admin-guide/tainted-kernels.rst for extra details on the taint flags that users can pick to compose the bitmask to assign to panic_on_taint.
哪些污染要panic,你自己决定了。所以才会有tainted_mask & panic_on_taint
。
sysctl for panic
/proc/sys/kernel
xxx:/proc/sys/kernel # ls -l panic*
-rw-r--r-- 1 root root 0 2020-07-11 20:32 panic
-rw-r--r-- 1 root root 0 2020-07-11 20:32 panic_on_oops
-rw-r--r-- 1 root root 0 2020-07-11 20:32 panic_on_rcu_stall
-rw-r--r-- 1 root root 0 2020-07-11 20:32 panic_on_warn
reference
- Documentation/admin-guide/sysctl/kernel.rst
- Documentation/admin-guide/tainted-kernels.rst
- Documentation/admin-guide/kernel-parameters.txt
本站采用CC BY-NC-SA 4.0进行许可 | 转载请注明原文链接 - Linux Kernel WARN()&BUG(), Oops&Panic, Tainted分析