稳定性范畴, 参考5.x kernel。

kernel Oops

Oops指的就是内核的不正确行为,比如对驱动来说:

static int i82092aa_pci_probe(struct pci_dev *dev,
const struct pci_device_id *id)
{
unsigned char configbyte;
int i, ret;

ret = pci_enable_device(dev);
if (ret)
return ret;

/* PCI Configuration Control */
pci_read_config_byte(dev, 0x40, &configbyte);

switch (configbyte&6) {
case 0:
socket_count = 2;
break;
case 2:
socket_count = 1;
break;
case 4:
case 6:
socket_count = 4;
break;

default:
dev_err(&dev->dev,
"Oops, you did something we didn't think of.\n");
ret = -EIO;
goto err_out_disable;
}

这里如果从PCI配置空间读出来的值有异常,我们就认为它是一个Oops:打印一条错误信息,探测失败。

分配内存失败也算一种Oops,只不过不需要打出错误信息。

td = kmalloc (sizeof (struct FS_BPENTRY), GFP_ATOMIC);
fs_dprintk (FS_DEBUG_ALLOC, "Alloc transd: %p(%zd)\n", td, sizeof (struct FS_BPENTRY));
if (!td) {
/* Oops out of mem */
return -ENOMEM;
}

在体系架构方面的Oops,比如arm64的bug Oops:

static int bug_handler(struct pt_regs *regs, unsigned int esr)
{
switch (report_bug(regs->pc, regs)) {
case BUG_TRAP_TYPE_BUG:
die("Oops - BUG", regs, 0);
break;

如果report_bug()返回的是BUG_TRAP_TYPE_BUG,那就报个Oops log。

再比如非法访问也会走die("Oops", ...):

static void die_kernel_fault(const char *msg, unsigned long addr,
unsigned int esr, struct pt_regs *regs)
{
bust_spinlocks(1);

pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
addr);

mem_abort_decode(esr);

show_pte(addr);
die("Oops", regs, esr); //tj
bust_spinlocks(0);
do_exit(SIGKILL);
}

看下die():

void die(const char *str, struct pt_regs *regs, int err)
{
int ret;
unsigned long flags;

raw_spin_lock_irqsave(&die_lock, flags);

oops_enter();

console_verbose();
bust_spinlocks(1);
ret = __die(str, err, regs); //tj

if (regs && kexec_should_crash(current))
crash_kexec(regs);

bust_spinlocks(0);
add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
oops_exit();

if (in_interrupt())
panic("Fatal exception in interrupt"); //tj
if (panic_on_oops)
panic("Fatal exception"); //tj

raw_spin_unlock_irqrestore(&die_lock, flags);

if (ret != NOTIFY_STOP)
do_exit(SIGSEGV);
}

再看下__die():

static int __die(const char *str, int err, struct pt_regs *regs)
{
static int die_counter;
int ret;

pr_emerg("Internal error: %s: %x [#%d]" S_PREEMPT S_SMP "\n",
str, err, ++die_counter);

/* trap and error numbers are mostly meaningless on ARM */
ret = notify_die(DIE_OOPS, str, regs, err, 0, SIGSEGV);
if (ret == NOTIFY_STOP)
return ret;

print_modules();
show_regs(regs);

dump_kernel_instr(KERN_EMERG, regs);

return ret;
}

打印类似如下log:

35.449887:   <6> Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
35.449893: <6> Modules linked in:
35.449901: <6> Process init (pid: 1, stack limit = 0x00000000826895f7)

后面会call panic(),不过是有条件的:

if (in_interrupt())
panic("Fatal exception in interrupt");
if (panic_on_oops)
panic("Fatal exception");

如果这个Oops发生在中断上下文里,会走panic()。如果不在中断里但panic_on_oops非0,也会走panic()。

可见,Oops不一定会导致panic。那bug_handler()里的BUG_TRAP_TYPE_BUG难道也不默认panic?

btw: arm64的Oops是怎么触发的呢?稍后看。

Kernel panic

kernel panic就是不可恢复的错误了,怎么处理?要么复位(重启),要么就一直停在这里。

/**
* panic - halt the system
* @fmt: The text string to print
*
* Display a message, then perform cleanups.
*
* This function never returns.
*/
void panic(const char *fmt, ...)
{
...
pr_emerg("Kernel panic - not syncing: %s\n", buf);
...
if (panic_timeout > 0) { //tj: 延迟重启
/*
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
pr_emerg("Rebooting in %d seconds..\n", panic_timeout); //tj

for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
if (i >= i_next) {
i += panic_blink(state ^= 1);
i_next = i + 3600 / PANIC_BLINK_SPD;
}
mdelay(PANIC_TIMER_STEP);
}
}
if (panic_timeout != 0) { //tj: 立即reboot
/*
* This will not be a clean reboot, with everything
* shutting down. But if there is a chance of
* rebooting the system it will be rebooted.
*/
if (panic_reboot_mode != REBOOT_UNDEFINED)
reboot_mode = panic_reboot_mode;
emergency_restart();
}
...
pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); //tj: 一直卡这

/* Do not scroll important messages printed above */
suppress_printk = 1;
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
if (i >= i_next) {
i += panic_blink(state ^= 1);
i_next = i + 3600 / PANIC_BLINK_SPD;
}
mdelay(PANIC_TIMER_STEP);
}
config PANIC_TIMEOUT
int "panic timeout"
default 0
help
Set the timeout value (in seconds) until a reboot occurs when the
kernel panics. If n = 0, then we wait forever. A timeout
value n > 0 will wait n seconds before rebooting, while a timeout
value n < 0 will reboot immediately.

这个panic timeout在Kconfig里说的很清楚。

BUG() ifndef HAVE_ARCH_BUG

先看代码注释:

/*
* Don't use BUG() or BUG_ON() unless there's really no way out; one
* example might be detecting data structure corruption in the middle
* of an operation that can't be backed out of. If the (sub)system
* can somehow continue operating, perhaps with reduced functionality,
* it's probably not BUG-worthy.
*
* If you're tempted to BUG(), think again: is completely giving up
* really the *only* solution? There are usually better options, where
* users don't need to reboot ASAP and can mostly shut down cleanly.
*/
#ifndef HAVE_ARCH_BUG
#define BUG() do { \
printk("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \
barrier_before_unreachable(); \
panic("BUG!"); \
} while (0)
#endif

无路可走了?那就call me。如果只是丢失功能,系统还能继续跑,那就不算BUG。我就call你debug不行啊:)

这里有个宏HAVE_ARCH_BUG:如果体系架构没有实现自己的BUG()(即没有定义HAVE_ARCH_BUG),那就用这个通用实现,它会直接导致panic。这是bug嘛,当然panic,但感觉哪里不对劲?

BUG() on arm64

看下arm64的实现:

#define __BUG_FLAGS(flags)                              \
asm volatile (__stringify(ASM_BUG_FLAGS(flags)));

#define BUG() do { \
__BUG_FLAGS(0); \
unreachable(); \
} while (0)

#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags))

#define HAVE_ARCH_BUG //tj: define

#include <asm-generic/bug.h>
#ifdef CONFIG_GENERIC_BUG

#define __BUG_ENTRY(flags) \
.pushsection __bug_table,"aw"; \
.align 2; \
14470: .long 14471f - 14470b; \
_BUGVERBOSE_LOCATION(__FILE__, __LINE__) \
.short flags; \
.popsection; \
14471:
#else
#define ASM_BUG_FLAGS(flags)                            \
__BUG_ENTRY(flags) \
brk BUG_BRK_IMM
/*
* #imm16 values used for BRK instruction generation
* ...
* 0x800: kernel-mode BUG() and WARN() traps
* ...
*/
#define BUG_BRK_IMM 0x800

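arm64的BUG()就是抛个brk 0x800指令,注释也写明了。(注意:上面的摘录有省略,原文件里#else分支只是把__BUG_ENTRY(flags)定义为空,随后就是#endif;ASM_BUG_FLAGS定义在#endif之后,并不受CONFIG_GENERIC_BUG控制。)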

bug_handler()就是对应这个的处理。

static int bug_handler(struct pt_regs *regs, unsigned int esr)
{
switch (report_bug(regs->pc, regs)) { //tj: report_bug()
case BUG_TRAP_TYPE_BUG:
die("Oops - BUG", regs, 0);
break;

case BUG_TRAP_TYPE_WARN:
break;

default:
/* unknown/unrecognised bug trap type */
return DBG_HOOK_ERROR;
}

/* If thread survives, skip over the BUG instruction and continue: */
arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
return DBG_HOOK_HANDLED;
}

static struct break_hook bug_break_hook = {
.fn = bug_handler,
.imm = BUG_BRK_IMM,
};

report bug:

enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
{
...
if (!is_valid_bugaddr(bugaddr))
return BUG_TRAP_TYPE_NONE;

bug = find_bug(bugaddr);
if (!bug)
return BUG_TRAP_TYPE_NONE;
...
if (file)
pr_crit("kernel BUG at %s:%u!\n", file, line);
else
pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n",
(void *)bugaddr);

return BUG_TRAP_TYPE_BUG;
}
int is_valid_bugaddr(unsigned long addr)
{
/*
* bug_handler() only called for BRK #BUG_BRK_IMM.
* So the answer is trivial -- any spurious instances with no
* bug table entry will be rejected by report_bug() and passed
* back to the debug-monitors code and handled as a fatal
* unexpected debug exception.
*/
return 1;
}

如果是bug,log里会通过pr_crit()打印出"kernel BUG at ..."。再大概看下bug_handler()的触发:

bug_handler就是bug_break_hook.fn:

void __init trap_init(void)
{
register_kernel_break_hook(&bug_break_hook);
#ifdef CONFIG_KASAN_SW_TAGS
register_kernel_break_hook(&kasan_break_hook);
#endif
debug_traps_init();
}

先是register:

static LIST_HEAD(kernel_break_hook);

void register_kernel_break_hook(struct break_hook *hook)
{
register_debug_hook(&hook->node, &kernel_break_hook);
}
static void register_debug_hook(struct list_head *node, struct list_head *list)
{
spin_lock(&debug_hook_lock);
list_add_rcu(node, list);
spin_unlock(&debug_hook_lock);

}

增加到list kernel_break_hook。然后初始化:

#define DBG_ESR_EVT_BRK         0x6

void __init debug_traps_init(void)
{
hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP,
TRAP_TRACE, "single-step handler");
hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP,
TRAP_BRKPT, "ptrace BRK handler");
}
void __init hook_debug_fault_code(int nr,
int (*fn)(unsigned long, unsigned int, struct pt_regs *),
int sig, int code, const char *name)
{
BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));

debug_fault_info[nr].fn = fn; //tj: brk_handler
debug_fault_info[nr].sig = sig;
debug_fault_info[nr].code = code;
debug_fault_info[nr].name = name;
}
static struct fault_info __refdata debug_fault_info[] = {
{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware breakpoint" },
{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware single-step" },
{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware watchpoint" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 3" },
{ do_bad, SIGTRAP, TRAP_BRKPT, "aarch32 BKPT" },
{ do_bad, SIGKILL, SI_KERNEL, "aarch32 vector catch" },
{ early_brk64, SIGTRAP, TRAP_BRKPT, "aarch64 BRK" }, //tj: here, index=6
{ do_bad, SIGKILL, SI_KERNEL, "unknown 7" },
};

就是这里的early_brk64被替换成了brk_handler。

默认的early_brk64()直接call bug_handler():

/*
* Initial handler for AArch64 BRK exceptions
* This handler only used until debug_traps_init().
*/
int __init early_brk64(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
#ifdef CONFIG_KASAN_SW_TAGS
unsigned int comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK;

if ((comment & ~KASAN_BRK_MASK) == KASAN_BRK_IMM)
return kasan_handler(regs, esr) != DBG_HOOK_HANDLED;
#endif
return bug_handler(regs, esr) != DBG_HOOK_HANDLED;
}

brk_handler()会走hook:

static int brk_handler(unsigned long unused, unsigned int esr,
struct pt_regs *regs)
{
if (call_break_hook(regs, esr) == DBG_HOOK_HANDLED)
return 0;

if (user_mode(regs)) {
send_user_sigtrap(TRAP_BRKPT);
} else {
pr_warn("Unexpected kernel BRK exception at EL1\n");
return -EFAULT;
}

return 0;
}

call_break_hook():

static int call_break_hook(struct pt_regs *regs, unsigned int esr)
{
struct break_hook *hook;
struct list_head *list;
int (*fn)(struct pt_regs *regs, unsigned int esr) = NULL;

list = user_mode(regs) ? &user_break_hook : &kernel_break_hook;

/*
* Since brk exception disables interrupt, this function is
* entirely not preemptible, and we can use rcu list safely here.
*/
list_for_each_entry_rcu(hook, list, node) {
unsigned int comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK;

if ((comment & ~hook->mask) == hook->imm)
fn = hook->fn;
}

return fn ? fn(regs, esr) : DBG_HOOK_ERROR;
}

就是在list里找到hook->fn,也就是bug_handler()

那BUG_ON()是怎么进入panic()的?除了发生在中断上下文的情况,就是靠panic_on_oops控制:

panic_on_oops

Controls the kernel’s behaviour when an oops or BUG is encountered.

= ================================================
0 Try to continue operation.
1 Panic immediately. If the panic sysctl is also non-zero then the
machine will be rebooted.
= ================================================

Android一般在init.rc开启:

on init
...
write /proc/sys/kernel/panic_on_oops 1

WARN()

先看注释:

/*
* WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report
* significant kernel issues that need prompt attention if they should ever
* appear at runtime.
*
* Do not use these macros when checking for invalid external inputs
* (e.g. invalid system call arguments, or invalid data coming from
* network/devices), and on transient conditions like ENOMEM or EAGAIN.
* These macros should be used for recoverable kernel issues only.
* For invalid external inputs, transient conditions, etc use
* pr_err[_once/_ratelimited]() followed by dump_stack(), if necessary.
* Do not include "BUG"/"WARNING" in format strings manually to make these
* conditions distinguishable from kernel issues.
*
* Use the versions with printk format strings to provide better diagnostics.
*/

WARN()系列宏是用来报告一些可恢复的(recoverable)内核问题的,不是用来检查非法的外部输入(比如入参),也不是用来处理没内存(ENOMEM)这类临时状况的。

WARN()系列宏的实现按是否定义了__WARN_FLAGS分成两种,这与体系架构有关:

#ifndef __WARN_FLAGS
extern __printf(4, 5)
void warn_slowpath_fmt(const char *file, const int line, unsigned taint,
const char *fmt, ...);
#define __WARN() __WARN_printf(TAINT_WARN, NULL)
#define __WARN_printf(taint, arg...) do { \
instrumentation_begin(); \
warn_slowpath_fmt(__FILE__, __LINE__, taint, arg); \
instrumentation_end(); \
} while (0)
#else
extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
#define __WARN() __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN))
#define __WARN_printf(taint, arg...) do { \
instrumentation_begin(); \
__warn_printk(arg); \
__WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\
instrumentation_end(); \
} while (0)
#define WARN_ON_ONCE(condition) ({ \
int __ret_warn_on = !!(condition); \
if (unlikely(__ret_warn_on)) \
__WARN_FLAGS(BUGFLAG_ONCE | \
BUGFLAG_TAINT(TAINT_WARN)); \
unlikely(__ret_warn_on); \
})
#endif

关注arm64:

#define __BUG_FLAGS(flags)                              \
asm volatile (__stringify(ASM_BUG_FLAGS(flags)));

#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags))

和arm64的BUG()定义放一起的,BUGFLAG_WARNING:

#ifdef CONFIG_GENERIC_BUG
#define BUGFLAG_WARNING (1 << 0) //tj: here
#define BUGFLAG_ONCE (1 << 1)
#define BUGFLAG_DONE (1 << 2)
#define BUGFLAG_NO_CUT_HERE (1 << 3) /* CUT_HERE already sent */
#define BUGFLAG_TAINT(taint) ((taint) << 8) //tj
#define BUG_GET_TAINT(bug) ((bug)->flags >> 8)
#endif
#define __WARN()		__WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN))

主要差异体现在__BUG_ENTRY,具体涉及arm64汇编,这里不关注。

__WARN_printf就是多了个log。

还有个__warn()函数,lib/bug.c里的report_bug()会用到:

enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
{
...
if (bug) {
...
warning = (bug->flags & BUGFLAG_WARNING) != 0;
...
if ((bug->flags & BUGFLAG_NO_CUT_HERE) == 0)
printk(KERN_DEFAULT CUT_HERE);

if (warning) {
/* this is a WARN_ON rather than BUG/BUG_ON */
__warn(file, line, (void *)bugaddr, BUG_GET_TAINT(bug), regs,
NULL);
return BUG_TRAP_TYPE_WARN;
}

if (file)
pr_crit("kernel BUG at %s:%u!\n", file, line);

bug->flags就是前面定义的那些BUGFLAG_*标志。arm64的WARN也是通过bug_handler()触发的。

__warn():

void __warn(const char *file, int line, void *caller, unsigned taint,
struct pt_regs *regs, struct warn_args *args)
{
disable_trace_on_warning();

if (file)
pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
raw_smp_processor_id(), current->pid, file, line,
caller);
else
pr_warn("WARNING: CPU: %d PID: %d at %pS\n",
raw_smp_processor_id(), current->pid, caller);

if (args)
vprintk(args->fmt, args->args);

if (panic_on_warn) { //tj: here
/*
* This thread may hit another WARN() in the panic path.
* Resetting this prevents additional WARN() from panicking the
* system on this thread. Other threads are blocked by the
* panic_mutex in panic().
*/
panic_on_warn = 0;
panic("panic_on_warn set ...\n");
}

print_modules();

if (regs)
show_regs(regs);
else
dump_stack();

print_irqtrace_events(current);

print_oops_end_marker();

/* Just a warning, don't kill lockdep. */
add_taint(taint, LOCKDEP_STILL_OK);

what? panic还能发生在warn上?没错,就是这个panic_on_warn,看下缘由:

panic_on_warn

Calls panic() in the WARN() path when set to 1. This is useful to avoid
a kernel rebuild when attempting to kdump at the location of a WARN().

= ================================================
0 Only WARN(), default behaviour.
1 Call panic() after printing out WARN() location.
= ================================================

kdump用时不用rebuild,ok。

tainted-kernels

一些Oops log会看到Tainted字样如下:

35.449908:   <6> CPU: 0 PID: 1 Comm: init Tainted: G S      W       4.14.117-perf+ #65

就是内核被污染了,查问题时用得上。即使污染源被去除,污染状态也会一直保留。

运行时可以通过/proc/sys/kernel/tainted查询污染状态;BUG、Oops、panic的log里也都会把它打印出来。

arm64的die()中就会增加污染标记:

void die(const char *str, struct pt_regs *regs, int err)
{
...
bust_spinlocks(0);
add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); //tj
oops_exit();
void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
{
if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
pr_warn("Disabling lock debugging due to kernel taint\n");

set_bit(flag, &tainted_mask); //tj

if (tainted_mask & panic_on_taint) {
panic_on_taint = 0;
panic("panic_on_taint set ...");
}
}

污染标记到tainted_mask里。

/* This cannot be an enum because some may be used in assembly source. */
#define TAINT_PROPRIETARY_MODULE 0
#define TAINT_FORCED_MODULE 1
#define TAINT_CPU_OUT_OF_SPEC 2
#define TAINT_FORCED_RMMOD 3
#define TAINT_MACHINE_CHECK 4
#define TAINT_BAD_PAGE 5
#define TAINT_USER 6
#define TAINT_DIE 7 //tj
#define TAINT_OVERRIDDEN_ACPI_TABLE 8
#define TAINT_WARN 9
#define TAINT_CRAP 10
#define TAINT_FIRMWARE_WORKAROUND 11
#define TAINT_OOT_MODULE 12
#define TAINT_UNSIGNED_MODULE 13
#define TAINT_SOFTLOCKUP 14
#define TAINT_LIVEPATCH 15
#define TAINT_AUX 16
#define TAINT_RANDSTRUCT 17
#define TAINT_FLAGS_COUNT 18

这么多污染种类。打印污染状态时:

/**
* print_tainted - return a string to represent the kernel taint state.
*
* For individual taint flag meanings, see Documentation/admin-guide/sysctl/kernel.rst
*
* The string is overwritten by the next call to print_tainted(),
* but is always NULL terminated.
*/
const char *print_tainted(void)
{
static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")];

BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT);

if (tainted_mask) {
char *s;
int i;

s = buf + sprintf(buf, "Tainted: ");
for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
const struct taint_flag *t = &taint_flags[i];
*s++ = test_bit(i, &tainted_mask) ? //tj
t->c_true : t->c_false;
}
*s = 0;
} else
snprintf(buf, sizeof(buf), "Not tainted");

return buf;
}

通过test_bit(i, &tainted_mask)从tainted_mask里取出之前设置的标记。像WARN()就设置了TAINT_WARN。

那都被污染了要不要panic?取决于panic_on_taint

   panic_on_taint= Bitmask for conditionally calling panic() in add_taint()
                   Format: <hex>[,nousertaint]
                   Hexadecimal bitmask representing the set of TAINT flags
                   that will cause the kernel to panic when add_taint() is
                   called with any of the flags in this set.
                   The optional switch "nousertaint" can be utilized to
                   prevent userspace forced crashes by writing to sysctl
                   /proc/sys/kernel/tainted any flagset matching with the
                   bitmask set on panic_on_taint.
                   See Documentation/admin-guide/tainted-kernels.rst for
                   extra details on the taint flags that users can pick
                   to compose the bitmask to assign to panic_on_taint.

哪些污染要panic,你自己决定了。所以才会有tainted_mask & panic_on_taint

sysctl for panic

/proc/sys/kernel

xxx:/proc/sys/kernel # ls -l panic*
-rw-r--r-- 1 root root 0 2020-07-11 20:32 panic
-rw-r--r-- 1 root root 0 2020-07-11 20:32 panic_on_oops
-rw-r--r-- 1 root root 0 2020-07-11 20:32 panic_on_rcu_stall
-rw-r--r-- 1 root root 0 2020-07-11 20:32 panic_on_warn

reference

  • Documentation/admin-guide/sysctl/kernel.rst
  • Documentation/admin-guide/tainted-kernels.rst
  • Documentation/admin-guide/kernel-parameters.txt