QC armv7平台kernel 4.x出现的死机问题,一开始很随机,以为是DDR硬件问题;直到后面死机越来越多,还出现过pc完全相同的现场,让人不得不怀疑是SW issue,于是同步提给QC看。从QC的答复看,他们是用TRACE32来定位的。下来温习了下armv7,发现完全可以用Red Hat的crash工具结合反汇编和源码来分析定位,对小厂来说也算是节约license成本了。

我们先看死机现场:

551 [   63.694025] Unable to handle kernel paging request at virtual address 383c0a1d
552 [   63.694034] pgd = db387da1
553 [   63.694036] [383c0a1d] *pgd=00000000
554 [   63.694045] Internal error: Oops: 5 [#1] PREEMPT SMP ARM
555 [   63.694048] Modules linked in: machine_ext_dlkm(O) machine_dlkm(O) cpe_lsm_dlkm(O) wcd_cpe_dlkm(O) analog_cdc_dlkm(O) aw87359_dlkm(O) digital_cdc_dlkm(O) stub_dlkm(O) mbhc_dlkm(O) wcd9xxx_dlkm(O) wcd_core_dlkm(O) swr_ctrl_dlkm(O) swr_dlkm(O) pinctrl_wcd_dlkm(O) native_dlkm(O) platform_dlkm(O) usf_dlkm(O) q6_dlkm(O) adsp_loader_dlkm(O) apr_dlkm(O) q6_notifier_dlkm(O)
558 [   63.694126] task: b4fb24a1 task.stack: 4ddc1adc
559 [   63.694137] PC is at kernfs_find_ns+0x78/0xf8
560 [   63.694143] LR is at kernfs_name_hash+0x10/0x68
561 [   63.694147] pc : [<c02dc290>]    lr : [<c02db85c>]    psr: 200b0013
562                sp : d9fa9d68  ip : 00000000  fp : 00000000
563 [   63.694150] r10: d9fa9ec0  r9 : 00000011  r8 : e47522a8
564 [   63.694153] r7 : 00000000  r6 : e4477bdc  r5 : 5d64df2d  r4 : 383c0a0d
565 [   63.694156] r3 : 7473616d  r2 : 00000031  r1 : 26e33aa1  r0 : 5d64df2d
...
599 [   63.694571] Stack: (0xd9fa9d68 to 0xd9faa000)
600 [   63.694577] 9d60:                   e47522a8 000409e3 00080040 ea0172d0 e4477bb0 e9d9bbe8
601 [   63.694584] 9d80: e9d9bc60 c02dc3d8 e4477bb0 c13091c8 e9d9bbe8 c026d0a8 00000000 d9fa9d9c
602 [   63.694591] 9da0: d9fa9d9c 000409e3 00000000 00000001 d9fa9eb8 00000000 c13091c8 d9fa9eb8
603 [   63.694599] 9dc0: d9fa8000 e47522a8 00000009 c0271234 c11532d0 e9d9bbe8 00000001 d9fa9eb8
604 [   63.694607] 9de0: d9fa8000 000409e3 b4ea6d82 61c88647 c13091c8 cedc902a d9fa9eb8 c0271488
605 [   63.694615] 9e00: 00000000 00000009 00000142 c03c2560 c03c2524 c1153ae8 c1153358 c03bead0
606 [   63.694623] 9e20: c50ed480 000409e3 d9fa9eb8 d9fa9eb8 d9fa9f70 cedc9010 00000001 c50ed480
607 [   63.694631] 9e40: d9fa8000 00000142 00000009 c0271a58 ebaa2908 c01a0b30 eba0e440 eba0e490
621 [   63.694751] [<c02dc290>] (kernfs_find_ns) from [<c02dc3d8>] (kernfs_iop_lookup+0x48/0xd8)
622 [   63.694760] [<c02dc3d8>] (kernfs_iop_lookup) from [<c026d0a8>] (lookup_slow+0x94/0x158)
623 [   63.694768] [<c026d0a8>] (lookup_slow) from [<c0271234>] (walk_component+0x1fc/0x2cc)
624 [   63.694775] [<c0271234>] (walk_component) from [<c0271488>] (link_path_walk+0x184/0x4e4)
625 [   63.694782] [<c0271488>] (link_path_walk) from [<c0271a58>] (path_openat+0x88/0xc70)
626 [   63.694789] [<c0271a58>] (path_openat) from [<c02739f8>] (do_filp_open+0x6c/0x110)
627 [   63.694797] [<c02739f8>] (do_filp_open) from [<c0261074>] (do_sys_open+0x128/0x23c)
628 [   63.694806] [<c0261074>] (do_sys_open) from [<c0108b40>] (ret_fast_syscall+0x0/0x54)
629 [   63.694814] Code: aa00000c e5944008 e3540000 0a000006 (e5943010)
630 [   63.694818] ---[ end trace 9656f9b4178497a9 ]---

非法地址访问,当前pc是c02dc290, 在kernfs_find_ns里。

crash.arm> sym c02dc290
c02dc290 (t) kernfs_find_ns+120 .../fs/kernfs/dir.c: 306

找到对应的c code:

 303 static int kernfs_name_compare(unsigned int hash, const char *name,
 304                                const void *ns, const struct kernfs_node *kn)
 305 {
 306         if (hash < kn->hash) //pc
 307                 return -1;

kernfs_find_ns() calls kernfs_name_compare():

 737 static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
 738                                           const unsigned char *name,
 739                                           const void *ns)
 740 {
 741         struct rb_node *node = parent->dir.children.rb_node;
 742         bool has_ns = kernfs_ns_enabled(parent);
 743         unsigned int hash;
 744
 745         lockdep_assert_held(&kernfs_mutex);
 746
 747         if (has_ns != (bool)ns) {
 748                 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
 749                      has_ns ? "required" : "invalid", parent->name, name);
 750                 return NULL;
 751         }
 752
 753         hash = kernfs_name_hash(name, ns);
 754         while (node) {
 755                 struct kernfs_node *kn;
 756                 int result;
 757
 758                 kn = rb_to_kn(node);
 759                 result = kernfs_name_compare(hash, name, ns, kn); //tj: here
 760                 if (result < 0)
 761                         node = node->rb_left;
 762                 else if (result > 0)
 763                         node = node->rb_right;
 764                 else
 765                         return kn;
 766         }
 767         return NULL;
 768 }

kernfs_find_ns的返回地址是c02dc3d8(取自上面的调用栈;注意死机时lr寄存器的值是c02db85c,已经在调用kernfs_name_hash的过程中被改写,不能直接用):

crash.arm> sym c02dc3d8
c02dc3d8 (t) kernfs_iop_lookup+72 .../fs/kernfs/dir.c: 990
 972 static struct dentry *kernfs_iop_lookup(struct inode *dir,
 973                                         struct dentry *dentry,
 974                                         unsigned int flags)
 975 {
 976         struct dentry *ret;
 977         struct kernfs_node *parent = dentry->d_parent->d_fsdata;
 978         struct kernfs_node *kn;
 979         struct inode *inode;
 980         const void *ns = NULL;
 981
 982         mutex_lock(&kernfs_mutex);
 983
 984         if (kernfs_ns_enabled(parent))
 985                 ns = kernfs_info(dir->i_sb)->ns;
 986
 987         kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
 988
 989         /* no such entry */
 990         if (!kn || !kernfs_active(kn)) { //tj: here
 991                 ret = NULL;
 992                 goto out_unlock;
 993         }

ok, 先有个代码印象,下面看下出错附近的反汇编:

crash.arm> dis kernfs_find_ns
0xc02dc218 <kernfs_find_ns>:    push    {r4, r5, r6, r7, lr}
0xc02dc21c <kernfs_find_ns+4>:  adds    r12, r2, #0
0xc02dc220 <kernfs_find_ns+8>:  ldrh    r3, [r0, #68]   ; 0x44
0xc02dc224 <kernfs_find_ns+12>: mov     r6, r1
0xc02dc228 <kernfs_find_ns+16>: movne   r12, #1
0xc02dc22c <kernfs_find_ns+20>: sub     sp, sp, #12
0xc02dc230 <kernfs_find_ns+24>: ubfx    r1, r3, #5, #1
0xc02dc234 <kernfs_find_ns+28>: ldr     r4, [r0, #44]   ; 0x2c
0xc02dc238 <kernfs_find_ns+32>: cmp     r1, r12
0xc02dc23c <kernfs_find_ns+36>: bne     0xc02dc2c8 <kernfs_find_ns+176>
0xc02dc240 <kernfs_find_ns+40>: mov     r0, r6
0xc02dc244 <kernfs_find_ns+44>: mov     r1, r2
0xc02dc248 <kernfs_find_ns+48>: mov     r7, r2
0xc02dc24c <kernfs_find_ns+52>: bl      0xc02db84c <kernfs_name_hash>
0xc02dc250 <kernfs_find_ns+56>: cmp     r4, #0
0xc02dc254 <kernfs_find_ns+60>: mov     r5, r0
0xc02dc258 <kernfs_find_ns+64>: bne     0xc02dc290 <kernfs_find_ns+120>
0xc02dc25c <kernfs_find_ns+68>: b       0xc02dc308 <kernfs_find_ns+240>
0xc02dc260 <kernfs_find_ns+72>: ldr     r3, [r4, #12]
0xc02dc264 <kernfs_find_ns+76>: mov     r0, r6
0xc02dc268 <kernfs_find_ns+80>: cmp     r7, r3
0xc02dc26c <kernfs_find_ns+84>: bcc     0xc02dc284 <kernfs_find_ns+108>
0xc02dc270 <kernfs_find_ns+88>: bhi     0xc02dc2a0 <kernfs_find_ns+136>
0xc02dc274 <kernfs_find_ns+92>: ldr     r1, [r4, #-4]
0xc02dc278 <kernfs_find_ns+96>: bl      0xc0444610 <strcmp>
0xc02dc27c <kernfs_find_ns+100>:        cmp     r0, #0
0xc02dc280 <kernfs_find_ns+104>:        bge     0xc02dc2b8 <kernfs_find_ns+160>
0xc02dc284 <kernfs_find_ns+108>:        ldr     r4, [r4, #8]
0xc02dc288 <kernfs_find_ns+112>:        cmp     r4, #0
0xc02dc28c <kernfs_find_ns+116>:        beq     0xc02dc2ac <kernfs_find_ns+148>
0xc02dc290 <kernfs_find_ns+120>:        ldr     r3, [r4, #16]
0xc02dc294 <kernfs_find_ns+124>:        cmp     r5, r3
0xc02dc298 <kernfs_find_ns+128>:        bcc     0xc02dc284 <kernfs_find_ns+108>
0xc02dc29c <kernfs_find_ns+132>:        bls     0xc02dc260 <kernfs_find_ns+72>
0xc02dc2a0 <kernfs_find_ns+136>:        ldr     r4, [r4, #4]
0xc02dc2a4 <kernfs_find_ns+140>:        cmp     r4, #0
0xc02dc2a8 <kernfs_find_ns+144>:        bne     0xc02dc290 <kernfs_find_ns+120>
0xc02dc2ac <kernfs_find_ns+148>:        mov     r0, #0
0xc02dc2b0 <kernfs_find_ns+152>:        add     sp, sp, #12
0xc02dc2b4 <kernfs_find_ns+156>:        pop     {r4, r5, r6, r7, pc}
0xc02dc2b8 <kernfs_find_ns+160>:        bne     0xc02dc2a0 <kernfs_find_ns+136>
0xc02dc2bc <kernfs_find_ns+164>:        sub     r0, r4, #16
0xc02dc2c0 <kernfs_find_ns+168>:        add     sp, sp, #12
0xc02dc2c4 <kernfs_find_ns+172>:        pop     {r4, r5, r6, r7, pc}
0xc02dc2c8 <kernfs_find_ns+176>:        ldr     lr, [r0, #12]
0xc02dc2cc <kernfs_find_ns+180>:        cmp     r1, #0
0xc02dc2d0 <kernfs_find_ns+184>:        movw    r12, #22808     ; 0x5918
0xc02dc2d4 <kernfs_find_ns+188>:        movw    r3, #46236      ; 0xb49c
0xc02dc2d8 <kernfs_find_ns+192>:        movt    r12, #49410     ; 0xc102
0xc02dc2dc <kernfs_find_ns+196>:        movw    r0, #46068      ; 0xb3f4
0xc02dc2e0 <kernfs_find_ns+200>:        movw    r2, #46244      ; 0xb4a4
0xc02dc2e4 <kernfs_find_ns+204>:        movt    r3, #49404      ; 0xc0fc
0xc02dc2e8 <kernfs_find_ns+208>:        str     r6, [sp, #4]
0xc02dc2ec <kernfs_find_ns+212>:        movt    r0, #49404      ; 0xc0fc
0xc02dc2f0 <kernfs_find_ns+216>:        str     lr, [sp]
0xc02dc2f4 <kernfs_find_ns+220>:        movne   r3, r12
0xc02dc2f8 <kernfs_find_ns+224>:        movt    r2, #49404      ; 0xc0fc
0xc02dc2fc <kernfs_find_ns+228>:        movw    r1, #749        ; 0x2ed
0xc02dc300 <kernfs_find_ns+232>:        bl      0xc012aecc <warn_slowpath_fmt>
0xc02dc304 <kernfs_find_ns+236>:        b       0xc02dc2ac <kernfs_find_ns+148>
0xc02dc308 <kernfs_find_ns+240>:        mov     r0, r4
0xc02dc30c <kernfs_find_ns+244>:        b       0xc02dc2b0 <kernfs_find_ns+152>

pc是这里:

0xc02dc290 <kernfs_find_ns+120>:        ldr     r3, [r4, #16]

死机现场r4 : 383c0a0d,r4+16就是383c0a1d了,也就是log出现的那个非法地址访问:

551 [   63.694025] Unable to handle kernel paging request at virtual address 383c0a1d

+16是啥?我们看c code:

 303 static int kernfs_name_compare(unsigned int hash, const char *name,
 304                                const void *ns, const struct kernfs_node *kn)
 305 {
 306         if (hash < kn->hash) //pc
 307                 return -1;

会是->hash吗?->hash的偏移:

crash.arm> struct kernfs_node.hash
struct kernfs_node {
  [32] unsigned int hash;
}

不对,->hash的偏移是32,不是16,对不上。继续看反汇编,可以借助dis -l打印出源码行号帮助定位:

.../fs/kernfs/dir.c: 754
0xc02dc288 <kernfs_find_ns+112>:        cmp     r4, #0 
0xc02dc28c <kernfs_find_ns+116>:        beq     0xc02dc2ac <kernfs_find_ns+148>
.../fs/kernfs/dir.c: 306
0xc02dc290 <kernfs_find_ns+120>:        ldr     r3, [r4, #16] //tj: pc

754行:

 754         while (node) {
 755                 struct kernfs_node *kn;

再看beq的跳转目标0xc02dc2ac:

.../fs/kernfs/dir.c: 750
0xc02dc2ac <kernfs_find_ns+148>:        mov     r0, #0
.../fs/kernfs/dir.c: 768
0xc02dc2b0 <kernfs_find_ns+152>:        add     sp, sp, #12
0xc02dc2b4 <kernfs_find_ns+156>:        pop     {r4, r5, r6, r7, pc}
 750                 return NULL;
...
 767         return NULL;
 768 }

到这里可以推断r4是node。那这个推断对不对呢?我们继续分析,后面主要会结合反汇编、C代码、armv7栈以及crash工具的使用,推导计算出log里的非法地址,大概还有一半多的内容。

剩余内容付款10.00元后3天内可查看