QC armv7平台kernel 4.x出现的死机问题,一开始很随机,以为是DDR硬件问题,直到后面死机越来越多,还出现过pc一样的现场,让人不得不怀疑是SW issue。同步提给QC看,从QC的答复看,他们是用TRACE32来定位的。下来温习了下armv7,发现完全可以用Red Hat的crash工具结合反汇编和源码来分析定位,对小厂来说也算是节约license成本了。

我们先看死机现场:

551 [   63.694025] Unable to handle kernel paging request at virtual address 383c0a1d
552 [ 63.694034] pgd = db387da1
553 [ 63.694036] [383c0a1d] *pgd=00000000
554 [ 63.694045] Internal error: Oops: 5 [#1] PREEMPT SMP ARM
555 [ 63.694048] Modules linked in: machine_ext_dlkm(O) machine_dlkm(O) cpe_lsm_dlkm(O) wcd_cpe_dlkm(O) analog_cdc_dlkm(O) aw87359_dlkm(O) digital_cdc_dlkm(O) stub_dlkm(O) mbhc_dlkm(O) wcd9xxx_dlkm(O) wcd_core_dlkm(O) swr_ctrl_dlkm(O) swr_dlkm(O) pinctrl_wcd_dlkm(O) native_dlkm(O) platform_dlkm(O) usf_dlkm(O) q6_dlkm(O) adsp_loader_dlkm(O) apr_dlkm(O) q6_notifier_dlkm(O)
558 [ 63.694126] task: b4fb24a1 task.stack: 4ddc1adc
559 [ 63.694137] PC is at kernfs_find_ns+0x78/0xf8
560 [ 63.694143] LR is at kernfs_name_hash+0x10/0x68
561 [ 63.694147] pc : [<c02dc290>] lr : [<c02db85c>] psr: 200b0013
562 sp : d9fa9d68 ip : 00000000 fp : 00000000
563 [ 63.694150] r10: d9fa9ec0 r9 : 00000011 r8 : e47522a8
564 [ 63.694153] r7 : 00000000 r6 : e4477bdc r5 : 5d64df2d r4 : 383c0a0d
565 [ 63.694156] r3 : 7473616d r2 : 00000031 r1 : 26e33aa1 r0 : 5d64df2d
...
599 [ 63.694571] Stack: (0xd9fa9d68 to 0xd9faa000)
600 [ 63.694577] 9d60: e47522a8 000409e3 00080040 ea0172d0 e4477bb0 e9d9bbe8
601 [ 63.694584] 9d80: e9d9bc60 c02dc3d8 e4477bb0 c13091c8 e9d9bbe8 c026d0a8 00000000 d9fa9d9c
602 [ 63.694591] 9da0: d9fa9d9c 000409e3 00000000 00000001 d9fa9eb8 00000000 c13091c8 d9fa9eb8
603 [ 63.694599] 9dc0: d9fa8000 e47522a8 00000009 c0271234 c11532d0 e9d9bbe8 00000001 d9fa9eb8
604 [ 63.694607] 9de0: d9fa8000 000409e3 b4ea6d82 61c88647 c13091c8 cedc902a d9fa9eb8 c0271488
605 [ 63.694615] 9e00: 00000000 00000009 00000142 c03c2560 c03c2524 c1153ae8 c1153358 c03bead0
606 [ 63.694623] 9e20: c50ed480 000409e3 d9fa9eb8 d9fa9eb8 d9fa9f70 cedc9010 00000001 c50ed480
607 [ 63.694631] 9e40: d9fa8000 00000142 00000009 c0271a58 ebaa2908 c01a0b30 eba0e440 eba0e490
621 [ 63.694751] [<c02dc290>] (kernfs_find_ns) from [<c02dc3d8>] (kernfs_iop_lookup+0x48/0xd8)
622 [ 63.694760] [<c02dc3d8>] (kernfs_iop_lookup) from [<c026d0a8>] (lookup_slow+0x94/0x158)
623 [ 63.694768] [<c026d0a8>] (lookup_slow) from [<c0271234>] (walk_component+0x1fc/0x2cc)
624 [ 63.694775] [<c0271234>] (walk_component) from [<c0271488>] (link_path_walk+0x184/0x4e4)
625 [ 63.694782] [<c0271488>] (link_path_walk) from [<c0271a58>] (path_openat+0x88/0xc70)
626 [ 63.694789] [<c0271a58>] (path_openat) from [<c02739f8>] (do_filp_open+0x6c/0x110)
627 [ 63.694797] [<c02739f8>] (do_filp_open) from [<c0261074>] (do_sys_open+0x128/0x23c)
628 [ 63.694806] [<c0261074>] (do_sys_open) from [<c0108b40>] (ret_fast_syscall+0x0/0x54)
629 [ 63.694814] Code: aa00000c e5944008 e3540000 0a000006 (e5943010)
630 [ 63.694818] ---[ end trace 9656f9b4178497a9 ]---

非法地址访问,当前pc是c02dc290, 在kernfs_find_ns里。

crash.arm> sym c02dc290
c02dc290 (t) kernfs_find_ns+120 .../fs/kernfs/dir.c: 306

找到对应的c code:

303 static int kernfs_name_compare(unsigned int hash, const char *name,
304 const void *ns, const struct kernfs_node *kn)
305 {
306 if (hash < kn->hash) //pc
307 return -1;

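306行在kernfs_name_compare()里,而pc的符号却是kernfs_find_ns,这是因为kernfs_name_compare()被编译器内联进了调用它的kernfs_find_ns()(后面的反汇编里也没有对它的bl调用)。kernfs_find_ns()的代码如下: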

737 static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
738 const unsigned char *name,
739 const void *ns)
740 {
741 struct rb_node *node = parent->dir.children.rb_node;
742 bool has_ns = kernfs_ns_enabled(parent);
743 unsigned int hash;
744
745 lockdep_assert_held(&kernfs_mutex);
746
747 if (has_ns != (bool)ns) {
748 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
749 has_ns ? "required" : "invalid", parent->name, name);
750 return NULL;
751 }
752
753 hash = kernfs_name_hash(name, ns);
754 while (node) {
755 struct kernfs_node *kn;
756 int result;
757
758 kn = rb_to_kn(node);
759 result = kernfs_name_compare(hash, name, ns, kn); //tj: here
760 if (result < 0)
761 node = node->rb_left;
762 else if (result > 0)
763 node = node->rb_right;
764 else
765 return kn;
766 }
767 return NULL;
768 }

kernfs_find_ns的返回地址(即调用栈里它的上一级,注意不是oops里打印的LR寄存器值c02db85c)是c02dc3d8:

crash.arm> sym c02dc3d8
c02dc3d8 (t) kernfs_iop_lookup+72 .../fs/kernfs/dir.c: 990
972 static struct dentry *kernfs_iop_lookup(struct inode *dir,
973 struct dentry *dentry,
974 unsigned int flags)
975 {
976 struct dentry *ret;
977 struct kernfs_node *parent = dentry->d_parent->d_fsdata;
978 struct kernfs_node *kn;
979 struct inode *inode;
980 const void *ns = NULL;
981
982 mutex_lock(&kernfs_mutex);
983
984 if (kernfs_ns_enabled(parent))
985 ns = kernfs_info(dir->i_sb)->ns;
986
987 kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
988
989 /* no such entry */
990 if (!kn || !kernfs_active(kn)) { //tj: here
991 ret = NULL;
992 goto out_unlock;
993 }

ok, 先有个代码印象,下面看下出错附近的反汇编:

crash.arm> dis kernfs_find_ns
0xc02dc218 <kernfs_find_ns>: push {r4, r5, r6, r7, lr}
0xc02dc21c <kernfs_find_ns+4>: adds r12, r2, #0
0xc02dc220 <kernfs_find_ns+8>: ldrh r3, [r0, #68] ; 0x44
0xc02dc224 <kernfs_find_ns+12>: mov r6, r1
0xc02dc228 <kernfs_find_ns+16>: movne r12, #1
0xc02dc22c <kernfs_find_ns+20>: sub sp, sp, #12
0xc02dc230 <kernfs_find_ns+24>: ubfx r1, r3, #5, #1
0xc02dc234 <kernfs_find_ns+28>: ldr r4, [r0, #44] ; 0x2c
0xc02dc238 <kernfs_find_ns+32>: cmp r1, r12
0xc02dc23c <kernfs_find_ns+36>: bne 0xc02dc2c8 <kernfs_find_ns+176>
0xc02dc240 <kernfs_find_ns+40>: mov r0, r6
0xc02dc244 <kernfs_find_ns+44>: mov r1, r2
0xc02dc248 <kernfs_find_ns+48>: mov r7, r2
0xc02dc24c <kernfs_find_ns+52>: bl 0xc02db84c <kernfs_name_hash>
0xc02dc250 <kernfs_find_ns+56>: cmp r4, #0
0xc02dc254 <kernfs_find_ns+60>: mov r5, r0
0xc02dc258 <kernfs_find_ns+64>: bne 0xc02dc290 <kernfs_find_ns+120>
0xc02dc25c <kernfs_find_ns+68>: b 0xc02dc308 <kernfs_find_ns+240>
0xc02dc260 <kernfs_find_ns+72>: ldr r3, [r4, #12]
0xc02dc264 <kernfs_find_ns+76>: mov r0, r6
0xc02dc268 <kernfs_find_ns+80>: cmp r7, r3
0xc02dc26c <kernfs_find_ns+84>: bcc 0xc02dc284 <kernfs_find_ns+108>
0xc02dc270 <kernfs_find_ns+88>: bhi 0xc02dc2a0 <kernfs_find_ns+136>
0xc02dc274 <kernfs_find_ns+92>: ldr r1, [r4, #-4]
0xc02dc278 <kernfs_find_ns+96>: bl 0xc0444610 <strcmp>
0xc02dc27c <kernfs_find_ns+100>: cmp r0, #0
0xc02dc280 <kernfs_find_ns+104>: bge 0xc02dc2b8 <kernfs_find_ns+160>
0xc02dc284 <kernfs_find_ns+108>: ldr r4, [r4, #8]
0xc02dc288 <kernfs_find_ns+112>: cmp r4, #0
0xc02dc28c <kernfs_find_ns+116>: beq 0xc02dc2ac <kernfs_find_ns+148>
0xc02dc290 <kernfs_find_ns+120>: ldr r3, [r4, #16]
0xc02dc294 <kernfs_find_ns+124>: cmp r5, r3
0xc02dc298 <kernfs_find_ns+128>: bcc 0xc02dc284 <kernfs_find_ns+108>
0xc02dc29c <kernfs_find_ns+132>: bls 0xc02dc260 <kernfs_find_ns+72>
0xc02dc2a0 <kernfs_find_ns+136>: ldr r4, [r4, #4]
0xc02dc2a4 <kernfs_find_ns+140>: cmp r4, #0
0xc02dc2a8 <kernfs_find_ns+144>: bne 0xc02dc290 <kernfs_find_ns+120>
0xc02dc2ac <kernfs_find_ns+148>: mov r0, #0
0xc02dc2b0 <kernfs_find_ns+152>: add sp, sp, #12
0xc02dc2b4 <kernfs_find_ns+156>: pop {r4, r5, r6, r7, pc}
0xc02dc2b8 <kernfs_find_ns+160>: bne 0xc02dc2a0 <kernfs_find_ns+136>
0xc02dc2bc <kernfs_find_ns+164>: sub r0, r4, #16
0xc02dc2c0 <kernfs_find_ns+168>: add sp, sp, #12
0xc02dc2c4 <kernfs_find_ns+172>: pop {r4, r5, r6, r7, pc}
0xc02dc2c8 <kernfs_find_ns+176>: ldr lr, [r0, #12]
0xc02dc2cc <kernfs_find_ns+180>: cmp r1, #0
0xc02dc2d0 <kernfs_find_ns+184>: movw r12, #22808 ; 0x5918
0xc02dc2d4 <kernfs_find_ns+188>: movw r3, #46236 ; 0xb49c
0xc02dc2d8 <kernfs_find_ns+192>: movt r12, #49410 ; 0xc102
0xc02dc2dc <kernfs_find_ns+196>: movw r0, #46068 ; 0xb3f4
0xc02dc2e0 <kernfs_find_ns+200>: movw r2, #46244 ; 0xb4a4
0xc02dc2e4 <kernfs_find_ns+204>: movt r3, #49404 ; 0xc0fc
0xc02dc2e8 <kernfs_find_ns+208>: str r6, [sp, #4]
0xc02dc2ec <kernfs_find_ns+212>: movt r0, #49404 ; 0xc0fc
0xc02dc2f0 <kernfs_find_ns+216>: str lr, [sp]
0xc02dc2f4 <kernfs_find_ns+220>: movne r3, r12
0xc02dc2f8 <kernfs_find_ns+224>: movt r2, #49404 ; 0xc0fc
0xc02dc2fc <kernfs_find_ns+228>: movw r1, #749 ; 0x2ed
0xc02dc300 <kernfs_find_ns+232>: bl 0xc012aecc <warn_slowpath_fmt>
0xc02dc304 <kernfs_find_ns+236>: b 0xc02dc2ac <kernfs_find_ns+148>
0xc02dc308 <kernfs_find_ns+240>: mov r0, r4
0xc02dc30c <kernfs_find_ns+244>: b 0xc02dc2b0 <kernfs_find_ns+152>

pc是这里:

0xc02dc290 <kernfs_find_ns+120>:        ldr     r3, [r4, #16]

死机现场r4 = 0x383c0a0d,r4 + 16(0x10)就是0x383c0a1d,也就是log里报的那个非法访问地址:

551 [   63.694025] Unable to handle kernel paging request at virtual address 383c0a1d

+16是啥?我们看c code:

303 static int kernfs_name_compare(unsigned int hash, const char *name,
304 const void *ns, const struct kernfs_node *kn)
305 {
306 if (hash < kn->hash) //pc
307 return -1;

会是->hash吗?->hash的偏移:

crash.arm> struct kernfs_node.hash
struct kernfs_node {
[32] unsigned int hash;
}

对不上:hash在struct kernfs_node里的偏移是32,而不是16,所以r4不会是kn指针本身。继续看反汇编,可以借助dis -l打印出源码行号帮助定位:

.../fs/kernfs/dir.c: 754
0xc02dc288 <kernfs_find_ns+112>: cmp r4, #0
0xc02dc28c <kernfs_find_ns+116>: beq 0xc02dc2ac <kernfs_find_ns+148>
.../fs/kernfs/dir.c: 306
0xc02dc290 <kernfs_find_ns+120>: ldr r3, [r4, #16] //tj: pc

754行:

754         while (node) {
755 struct kernfs_node *kn;

再看上面beq跳转的目标地址0xc02dc2ac:

.../fs/kernfs/dir.c: 750
0xc02dc2ac <kernfs_find_ns+148>: mov r0, #0
.../fs/kernfs/dir.c: 768
0xc02dc2b0 <kernfs_find_ns+152>: add sp, sp, #12
0xc02dc2b4 <kernfs_find_ns+156>: pop {r4, r5, r6, r7, pc}
 750                 return NULL;
...
767 return NULL;
768 }

到这里可以推断r4是node,那这个推断对不对呢?我们继续分析。后面主要会结合反汇编、C代码、armv7栈以及crash工具的使用,推导计算出log里的非法地址,大概还有一半多的内容。