QCOM Android Q平台,现场adb不可用,TP不能用,Kernel 4.x。

热键进入ramdump mode,导出RAM后check kernel log,发现如下eMMC错误:

[41534.077689] mmc0: Reset 0x4 never completed.
[41534.077715] mmc0: sdhci: ============ SDHCI REGISTER DUMP ===========
[41534.077722] mmc0: sdhci: Sys addr: 0x00000000 | Version: 0x00007202
[41534.077728] mmc0: sdhci: Blk size: 0x00000200 | Blk cnt: 0x00000001
[41534.077733] mmc0: sdhci: Argument: 0x002c0046 | Trn mode: 0x00000013
...
[41534.077814] ----------- VENDOR REGISTER DUMP -----------
[41534.077816] ---- Debug RAM dump ----
[41534.077823] cmdq-host: Debug RAM wrap-around: 0x0000ff80 | Debug RAM overlap: 0x00000596
[41534.077828] cmdq-host: Debug RAM dump [0]: 0x0000014d
[41534.077834] cmdq-host: Debug RAM dump [1]: 0x0000006c

ok,上code:

/* hw clears the bit when it's done */
while (1) {
bool timedout = ktime_after(ktime_get(), timeout);

if (!(sdhci_readb(host, SDHCI_SOFTWARE_RESET) & mask))
break;
if (timedout) {
pr_err("%s: Reset 0x%x never completed.\n", //tj: here
mmc_hostname(host->mmc), (int)mask);
MMC_TRACE(host->mmc, "%s: Reset 0x%x never completed\n",
__func__, (int)mask);
if ((host->quirks2 & SDHCI_QUIRK2_USE_RESET_WORKAROUND)
&& host->ops->reset_workaround) {
if (!host->reset_wa_applied) {
/*
* apply the workaround and issue
* reset again.
*/
host->ops->reset_workaround(host, 1);
host->reset_wa_applied = 1;
host->reset_wa_cnt++;
goto retry_reset;
} else {
pr_err("%s: Reset 0x%x failed with workaround\n",
mmc_hostname(host->mmc),
(int)mask);
/* clear the workaround */
host->ops->reset_workaround(host, 0);
host->reset_wa_applied = 0;
}
}

sdhci_dumpregs(host); //tj:here
return;
}
udelay(10); //tj: delay, if not timeout
}
void sdhci_dumpregs(struct sdhci_host *host)
{
MMC_TRACE(host->mmc,
"%s: 0x04=0x%08x 0x06=0x%08x 0x0E=0x%08x 0x30=0x%08x 0x34=0x%08x 0x38=0x%08x\n",
__func__,
sdhci_readw(host, SDHCI_BLOCK_SIZE),
sdhci_readw(host, SDHCI_BLOCK_COUNT),
sdhci_readw(host, SDHCI_COMMAND),
sdhci_readl(host, SDHCI_INT_STATUS),
sdhci_readl(host, SDHCI_INT_ENABLE),
sdhci_readl(host, SDHCI_SIGNAL_ENABLE));
mmc_stop_tracing(host->mmc);

SDHCI_DUMP("============ SDHCI REGISTER DUMP ===========\n");

so,这条 log 是在 timedout 为 true(reset 等待超时)的分支里打印的,我们追下 why timeout。

/**
 * ktime_compare - Three-way comparison of two ktime_t values.
 * @cmp1: comparable1
 * @cmp2: comparable2
 *
 * Return: -1 if cmp1 < cmp2, 1 if cmp1 > cmp2, 0 otherwise.
 */
static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2)
{
	/* Each relational test yields 0 or 1, so the difference is -1, 0 or 1. */
	return (cmp1 > cmp2) - (cmp1 < cmp2);
}

/**
 * ktime_after - Check whether one ktime_t value is bigger than another one.
 * @cmp1: comparable1
 * @cmp2: comparable2
 *
 * Return: true if cmp1 happened after cmp2, i.e. ktime_compare() reports
 * cmp1 as strictly greater; false otherwise (including when they are equal).
 */
static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2)
{
	const int order = ktime_compare(cmp1, cmp2);

	return order > 0;
}

ok, 说明 cmp1>cmp2。也就是ktime_get() > timeout。把前面的code check下:

void sdhci_reset(struct sdhci_host *host, u8 mask)
{
ktime_t timeout;

retry_reset:
sdhci_writeb(host, mask, SDHCI_SOFTWARE_RESET);

if (mask & SDHCI_RESET_ALL) {
host->clock = 0;
/* Reset-all turns off SD Bus Power */
if (host->quirks2 & SDHCI_QUIRK2_CARD_ON_NEEDS_BUS_ON)
sdhci_runtime_pm_bus_off(host);
}

/* Wait max 100 ms */
timeout = ktime_add_ms(ktime_get(), 100);

if (host->ops->check_power_status && host->pwr &&
(mask & SDHCI_RESET_ALL))
host->ops->check_power_status(host, REQ_BUS_OFF);

/* clear pending normal/error interrupt status */
sdhci_writel(host, sdhci_readl(host, SDHCI_INT_STATUS),
SDHCI_INT_STATUS);
/*
 * Add a millisecond count to a ktime_t value.
 * The milliseconds are scaled by NSEC_PER_MSEC and the product is then
 * added to kt through ktime_add_ns().
 */
static inline ktime_t ktime_add_ms(const ktime_t kt, const u64 msec)
{
	const u64 delta_ns = msec * NSEC_PER_MSEC;

	return ktime_add_ns(kt, delta_ns);
}

/*
 * Add a ktime_t variable and a scalar nanosecond value.
 * res = kt + nsval
 *
 * Expands to a plain sum of the two arguments; each argument is
 * parenthesized and appears exactly once in the expansion.
 */
#define ktime_add_ns(kt, nsval) ((kt) + (nsval))

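timeout 也就是写入 SDHCI_SOFTWARE_RESET 之后那一刻的 ktime_get() + 100ms。也就是说过了 100ms,硬件仍然没有把 mask 对应的 bit 清零(reset 还没完成);而且从 log 里只出现一次 "never completed"、没有 "failed with workaround" 来看,也没有走 workaround。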

为啥会这样?一开始以为是个别机器的问题,后来发现有 app 在以 ms 级的频率操作 eMMC,怀疑就是它了 :) 不过还没有确认(not confirmed yet)。对吧?