定屏问题分析 | TJ的技术博客

QCOM Android Q平台，现场adb不可用，TP不能用，Kernel 4.x。

热键进入ramdump mode，导出RAM后check kernel log，发现如下eMMC错误:

[41534.077689] mmc0: Reset 0x4 never completed.
[41534.077715] mmc0: sdhci: ============ SDHCI REGISTER DUMP ===========
[41534.077722] mmc0: sdhci: Sys addr:  0x00000000 | Version:  0x00007202
[41534.077728] mmc0: sdhci: Blk size:  0x00000200 | Blk cnt:  0x00000001
[41534.077733] mmc0: sdhci: Argument:  0x002c0046 | Trn mode: 0x00000013
...
[41534.077814] ----------- VENDOR REGISTER DUMP -----------
[41534.077816] ---- Debug RAM dump ----
[41534.077823] cmdq-host: Debug RAM wrap-around: 0x0000ff80 | Debug RAM overlap: 0x00000596
[41534.077828] cmdq-host: Debug RAM dump [0]: 0x0000014d
[41534.077834] cmdq-host: Debug RAM dump [1]: 0x0000006c

ok，上code:

/* hw clears the bit when it's done */
while (1) {
        bool timedout = ktime_after(ktime_get(), timeout);

        if (!(sdhci_readb(host, SDHCI_SOFTWARE_RESET) & mask))
                break;
        if (timedout) {
                pr_err("%s: Reset 0x%x never completed.\n", //tj: here
                        mmc_hostname(host->mmc), (int)mask);
                MMC_TRACE(host->mmc, "%s: Reset 0x%x never completed\n",
                                __func__, (int)mask);
                if ((host->quirks2 & SDHCI_QUIRK2_USE_RESET_WORKAROUND)
                        && host->ops->reset_workaround) {
                        if (!host->reset_wa_applied) {
                                /*
                                 * apply the workaround and issue
                                 * reset again.
                                 */
                                host->ops->reset_workaround(host, 1);
                                host->reset_wa_applied = 1;
                                host->reset_wa_cnt++;
                                goto retry_reset;
                        } else {
                                pr_err("%s: Reset 0x%x failed with workaround\n",
                                        mmc_hostname(host->mmc),
                                        (int)mask);
                                /* clear the workaround */
                                host->ops->reset_workaround(host, 0);
                                host->reset_wa_applied = 0;
                        }
                }

                sdhci_dumpregs(host); //tj:here
                return;
        }
        udelay(10); //tj: delay, if not timeout
}

void sdhci_dumpregs(struct sdhci_host *host)
{
        MMC_TRACE(host->mmc,
                "%s: 0x04=0x%08x 0x06=0x%08x 0x0E=0x%08x 0x30=0x%08x 0x34=0x%08x 0x38=0x%08x\n",
                __func__,
                sdhci_readw(host, SDHCI_BLOCK_SIZE),
                sdhci_readw(host, SDHCI_BLOCK_COUNT),
                sdhci_readw(host, SDHCI_COMMAND),
                sdhci_readl(host, SDHCI_INT_STATUS),
                sdhci_readl(host, SDHCI_INT_ENABLE),
                sdhci_readl(host, SDHCI_SIGNAL_ENABLE));
        mmc_stop_tracing(host->mmc);

        SDHCI_DUMP("============ SDHCI REGISTER DUMP ===========\n");

so，是timedout导致的这个问题，我们追下why timeout。

/*
 * Three-way comparison of two ktime_t values.
 *
 * Returns -1 when cmp1 < cmp2, 1 when cmp1 > cmp2 and 0 when they are equal.
 */
static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2)
{
        /* Each relational test yields 0 or 1, so the difference is -1/0/1. */
        return (cmp1 > cmp2) - (cmp1 < cmp2);
}

/**
 * ktime_after - Check whether one ktime_t value is strictly greater than another.
 * @cmp1:       value being tested
 * @cmp2:       value it is tested against
 *
 * Return: true if ktime_compare() reports cmp1 as greater than cmp2,
 * false if the two are equal or cmp1 is smaller.
 */
static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2)
{
        const int order = ktime_compare(cmp1, cmp2);

        return order > 0;
}

ok, 说明 cmp1>cmp2。也就是ktime_get() > timeout。把前面的code check下：

void sdhci_reset(struct sdhci_host *host, u8 mask)
{
        ktime_t timeout;

retry_reset:
        sdhci_writeb(host, mask, SDHCI_SOFTWARE_RESET);

        if (mask & SDHCI_RESET_ALL) {
                host->clock = 0;
                /* Reset-all turns off SD Bus Power */
                if (host->quirks2 & SDHCI_QUIRK2_CARD_ON_NEEDS_BUS_ON)
                        sdhci_runtime_pm_bus_off(host);
        }

        /* Wait max 100 ms */
        timeout = ktime_add_ms(ktime_get(), 100);

        if (host->ops->check_power_status && host->pwr &&
            (mask & SDHCI_RESET_ALL))
                host->ops->check_power_status(host, REQ_BUS_OFF);

        /* clear pending normal/error interrupt status */
        sdhci_writel(host, sdhci_readl(host, SDHCI_INT_STATUS),
                        SDHCI_INT_STATUS);

/*
 * Add a millisecond count to a ktime_t value.
 *
 * The milliseconds are first scaled by NSEC_PER_MSEC and the product is
 * then added to kt through ktime_add_ns().
 */
static inline ktime_t ktime_add_ms(const ktime_t kt, const u64 msec)
{
        const u64 delta_ns = msec * NSEC_PER_MSEC;

        return ktime_add_ns(kt, delta_ns);
}

/*
 * Add a scalar nanosecond value to a ktime_t variable:
 * res = kt + nsval
 *
 * Expands to a plain addition; both arguments are parenthesized and each
 * is evaluated exactly once.
 */
#define ktime_add_ns(kt, nsval)         ((kt) + (nsval))

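timeout也就是发出reset之后取到的ktime_get() + 100ms，也就是说过了100ms，SDHCI_SOFTWARE_RESET对应的bit还没被硬件清零（reset没完成）。而且也没有走workaround：log里"Reset 0x4 never completed."之后紧接着就是REGISTER DUMP，没有出现"failed with workaround"，也没有第二次"never completed"，说明是直接走到sdhci_dumpregs()然后return的。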

为啥会这样？一开始以为是个别机器的问题，后来发现有app在以ms级的间隔频繁操作eMMC，很可能就是它了 :) 不过目前还没有最终确认（not confirmed yet），对吧？