先看HW架构

host-arch-with-cqe.png

CQE: Command Queueing Engine,负责管理software和eMMC device的data传输。

首先CQE接收来自SW的task(via TDL和doorbell),接下来的command flows主要有三步:

  • Step1. Queuing a Transaction(CMD44+CMD45)

CQE发出CMD44/CMD45给eMMC用来queue a (data xfer) task, 当device有个R1响应就表示这个task已经queued in the device。可以通过读CQDPT寄存器来判断, CQE接收到R1响应就会置位,任务完成clear the bit。

  • Step2. Checking the Queue Status(CMD13)

CQE发出CMD13读QSR(Queue Status Register)来决定执行哪个task, device会返回个R1, 这个R1就是32bit value,每个bit对应一个task,如果bit=0,那这个task没有ready for execution,bit=1就是ready了。

  • Step3. Execution of a Queued Task(CMD46/CMD47)

CQE发出CMD46(读) or CMD47(写)给那些在QSR里已经"ready for execution"的tasks。

Linux代码分析

看下超时相关代码,msm kernel 4.14

mmc0: request with tag: 25 flags: 0x103001 timed out 

find the code:

/*
 * Block-layer timeout callback for CMDQ requests: log the timed-out
 * request's tag and command flags, then delegate the handled /
 * reset-timer / not-handled decision to the mmc queue's
 * ->cmdq_req_timed_out hook.
 */
enum blk_eh_timer_return mmc_cmdq_rq_timed_out(struct request *req)
{
	struct mmc_queue *mq = req->q->queuedata;

	pr_err("%s: request with tag: %d flags: 0x%x timed out\n",
	       mmc_hostname(mq->card->host), req->tag, req->cmd_flags);

	return mq->cmdq_req_timed_out(req); /* tj: mmc_blk_cmdq_req_timed_out() */
}
/*
 * CMDQ queue init (excerpt): install the timeout handler and set the
 * per-request timeout to 120 s — this is the source of the 120 s
 * deadline behind the "timed out" log message.
 */
int mmc_cmdq_init(struct mmc_queue *mq, struct mmc_card *card)
{
...
	blk_queue_rq_timed_out(mq->queue, mmc_cmdq_rq_timed_out); /* tj: here */
	blk_queue_rq_timeout(mq->queue, 120 * HZ); /* tj: 120 s */
	card->cmdq_init = true;

	return ret;
}

block/blk-settings.c:

/*
 * Set the queue's default per-request timeout (in jiffies); picked up
 * by blk_add_timer() for requests that did not set their own timeout.
 */
void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
{
	q->rq_timeout = timeout;
}

/*
 * Register the legacy-path request timeout handler. The WARN_ON_ONCE
 * flags misuse on a blk-mq queue, which has its own timeout handling.
 */
void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
{
	WARN_ON_ONCE(q->mq_ops);
	q->rq_timed_out_fn = fn;
}

看下who calls ->rq_timed_out_fn:

/*
 * Legacy-path timeout dispatch: ask the driver's ->rq_timed_out_fn what
 * to do with the expired request and act on the returned code.
 */
static void blk_rq_timed_out(struct request *req)
{
	struct request_queue *q = req->q;
	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;

	if (q->rq_timed_out_fn)
		ret = q->rq_timed_out_fn(req); /* tj: here -> mmc_cmdq_rq_timed_out() */
	switch (ret) {
	case BLK_EH_HANDLED:
		__blk_complete_request(req); /* tj: here — complete via BLOCK_SOFTIRQ */
		break;
	case BLK_EH_RESET_TIMER:
		/* Driver wants more time: re-arm the timer, un-mark completion. */
		blk_add_timer(req);
		blk_clear_rq_complete(req);
		break;
	case BLK_EH_NOT_HANDLED:
		/*
		 * LLD handles this for now but in the future
		 * we can send a request msg to abort the command
		 * and we can move more of the generic scsi eh code to
		 * the blk layer.
		 */
		break;
	default:
		printk(KERN_ERR "block: bad eh return: %d\n", ret);
		break;
	}
}

should be from blk_rq_check_expired():

/*
 * Per-request expiry check, called for each entry on q->timeout_list
 * (excerpt; the function's closing brace is not shown).
 */
static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
		unsigned int *next_set)
{
	/* Expired: jiffies has passed the deadline set in blk_add_timer(). */
	if (time_after_eq(jiffies, rq->deadline)) {
		list_del_init(&rq->timeout_list);

		/*
		 * Check if we raced with end io completion
		 */
		if (!blk_mark_rq_complete(rq))
			blk_rq_timed_out(rq); /* tj: here — run timeout handling */
	}

这里的rq->deadline是启动request时的jiffies加上120s超时,这个request过了120s还没完成? 这就是异常了。

/*
 * kblockd work item: walk q->timeout_list and check every pending
 * request for expiry (excerpt; the unlock/return path is not shown).
 */
void blk_timeout_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, timeout_work);
	unsigned long flags, next = 0;
	struct request *rq, *tmp;
	int next_set = 0;

	spin_lock_irqsave(q->queue_lock, flags);

	list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
		blk_rq_check_expired(rq, &next, &next_set);
	/* NOTE(review): the next line belongs to blk_alloc_queue_node(); paste artifact */
	INIT_WORK(&q->timeout_work, blk_timeout_work);
/*
 * Queue timeout-timer callback: defer the actual expiry scan to
 * kblockd, which runs blk_timeout_work().
 */
static void blk_rq_timed_out_timer(unsigned long data)
{
	struct request_queue *q = (struct request_queue *)data;

	kblockd_schedule_work(&q->timeout_work);
}
/*
 * Queue allocation (excerpt): arm q->timeout with
 * blk_rq_timed_out_timer as its expiry callback.
 */
struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
...
	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
...
}

who triggers this timer? It should be blk_add_timer(), called when starting a request:

/**
 * blk_add_timer - Start timeout timer for a single request
 * @req: request that is about to start running.
 *
 * Notes:
 * Each request has its own timer, and as it is added to the queue, we
 * set up the timer. When the request completes, we cancel the timer.
 */
void blk_add_timer(struct request *req)
{
	struct request_queue *q = req->q;
	unsigned long expiry;

	if (!q->mq_ops)
		lockdep_assert_held(q->queue_lock);

	/* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */
	if (!q->mq_ops && !q->rq_timed_out_fn)
		return;

	BUG_ON(!list_empty(&req->timeout_list));

	/*
	 * Some LLDs, like scsi, peek at the timeout to prevent a
	 * command from being retried forever.
	 */
	if (!req->timeout)
		req->timeout = q->rq_timeout; /* tj: defaults to the queue's 120 s */

	req->deadline = jiffies + req->timeout; /* tj: checked by blk_rq_check_expired() */
	/* NOTE(review): excerpt truncated — the rest of blk_add_timer() is not shown */

看到没有, req->timeout就是120s,给了req->deadline。跟下:

blk_queue_start_tag() -> blk_start_request() -> blk_add_timer()

ok, 回到mmc driver:

/*
 * Try to allocate a tag and start the peeked request under the queue
 * lock. Returns true when blk_queue_start_tag() returned non-zero,
 * i.e. the request could NOT be started (e.g. no free tag) — the
 * caller negates this as its final wait condition. Starting the tag
 * is what arms the request's 120 s timer
 * (blk_queue_start_tag() -> blk_start_request() -> blk_add_timer()).
 */
static bool mmc_check_blk_queue_start_tag(struct request_queue *q,
		struct request *req)
{
	int ret;

	spin_lock_irq(q->queue_lock);
	ret = blk_queue_start_tag(q, req); /* tj: here */
	spin_unlock_irq(q->queue_lock);

	return !!ret;
}
/*
 * Block the cmdq dispatch thread until it is safe to issue the next
 * request. All numbered conditions below are evaluated in one
 * wait_event() predicate; condition 6 must stay last because
 * mmc_check_blk_queue_start_tag() has the side effect of actually
 * starting the request (and arming its timeout timer).
 */
static inline void mmc_cmdq_ready_wait(struct mmc_host *host,
		struct mmc_queue *mq)
{
	struct mmc_cmdq_context_info *ctx = &host->cmdq_ctx;
	struct request_queue *q = mq->queue;

	/*
	 * Wait until all of the following conditions are true:
	 * 1. There is a request pending in the block layer queue
	 * to be processed.
	 * 2. If the peeked request is flush/discard then there shouldn't
	 * be any other direct command active.
	 * 3. cmdq state should be unhalted.
	 * 4. cmdq state shouldn't be in error state.
	 * 5. There is no outstanding RPMB request pending.
	 * 6. free tag available to process the new request.
	 * (This must be the last condtion to check)
	 */
	wait_event(ctx->wait, kthread_should_stop()
		|| (mmc_peek_request(mq) &&
		!(((req_op(mq->cmdq_req_peeked) == REQ_OP_FLUSH) ||
		(req_op(mq->cmdq_req_peeked) == REQ_OP_DISCARD) ||
		(req_op(mq->cmdq_req_peeked) == REQ_OP_SECURE_ERASE))
		&& test_bit(CMDQ_STATE_DCMD_ACTIVE, &ctx->curr_state))
		&& !(!host->card->part_curr && !mmc_card_suspended(host->card)
		&& mmc_host_halt(host))
		&& !(!host->card->part_curr && mmc_host_cq_disable(host) &&
		!mmc_card_suspended(host->card))
		&& !test_bit(CMDQ_STATE_ERR, &ctx->curr_state)
		&& !atomic_read(&host->rpmb_req_pending)
		&& !mmc_check_blk_queue_start_tag(q, mq->cmdq_req_peeked))); /* tj: here */
}

check上面注释的no.6.

/*
 * CMDQ dispatch kthread: wait in mmc_cmdq_ready_wait() until a request
 * has been peeked and started, then issue it via ->cmdq_issue_fn under
 * the cmdq rwsem. Failed issues are deliberately not requeued here —
 * recovery happens on the completion-softirq path.
 */
static int mmc_cmdq_thread(void *d)
{
	struct mmc_queue *mq = d;
	struct mmc_card *card = mq->card;

	struct mmc_host *host = card->host;

	current->flags |= PF_MEMALLOC;
	if (card->host->wakeup_on_idle)
		set_wake_up_idle(true);

	while (1) {
		int ret = 0;

		mmc_cmdq_ready_wait(host, mq);
		if (kthread_should_stop())
			break;

		ret = mmc_cmdq_down_rwsem(host, mq->cmdq_req_peeked);
		if (ret) {
			mmc_cmdq_up_rwsem(host);
			continue;
		}
		ret = mq->cmdq_issue_fn(mq, mq->cmdq_req_peeked);
		mmc_cmdq_up_rwsem(host);

		/*
		 * Don't requeue if issue_fn fails.
		 * Recovery will be come by completion softirq
		 * Also we end the request if there is a partition switch
		 * error, so we should not requeue the request here.
		 */
	} /* loop */

	return 0;
}
/*
 * mmc_init_queue (excerpt): spawn the per-queue "mmc-cmdqd" dispatch
 * thread running mmc_cmdq_thread().
 */
int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
	spinlock_t *lock, const char *subname, int area_type)
{
...
	mq->thread = kthread_run(mmc_cmdq_thread, mq,
		"mmc-cmdqd/%d%s", /* tj: here — thread name visible in ps */
		host->index,
		subname ? subname : "");
	if (IS_ERR(mq->thread)) {
		pr_err("%s: %d: cmdq: failed to start mmc-cmdqd thread\n",
			mmc_hostname(card->host), ret);
		ret = PTR_ERR(mq->thread);
	}

也就是说,cmdq状态ok后可以扔请求给HW CMDQ, rt? 在check cmdq状态时发现有tag超时了。

我们再来看下超时后的异常处理, 超时的entry: ->cmdq_req_timed_out:

/*
 * mmc_blk_alloc_req (excerpt): when the card came up with a command
 * queue, wire up the per-queue CMDQ callbacks — including the timeout
 * entry point ->cmdq_req_timed_out.
 */
static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
...
{
...
	if (card->cmdq_init) {
		md->flags |= MMC_BLK_CMD_QUEUE;
		md->queue.cmdq_complete_fn = mmc_blk_cmdq_complete_rq;
		md->queue.cmdq_issue_fn = mmc_blk_cmdq_issue_rq;
		md->queue.cmdq_error_fn = mmc_blk_cmdq_err;
		md->queue.cmdq_req_timed_out = mmc_blk_cmdq_req_timed_out; /* tj: here */
		md->queue.cmdq_shutdown = mmc_blk_cmdq_shutdown;
	}
/*
 * mmc driver's decision for a timed-out CMDQ request.
 * Returns:
 *   BLK_EH_RESET_TIMER - request not yet issued to the LLD; give it time
 *   BLK_EH_NOT_HANDLED - recovery already in flight, or a DCMD whose
 *                        waiter was completed directly
 *   BLK_EH_HANDLED     - first timeout; block layer completes the
 *                        request, which drives the softirq/error-work path
 */
static enum blk_eh_timer_return mmc_blk_cmdq_req_timed_out(struct request *req)
{
	struct mmc_queue *mq = req->q->queuedata;
	struct mmc_host *host = mq->card->host;
	struct mmc_queue_req *mq_rq = req->special;
	struct mmc_request *mrq;
	struct mmc_cmdq_req *cmdq_req;
	struct mmc_cmdq_context_info *ctx_info = &host->cmdq_ctx;

	BUG_ON(!host);

	/*
	 * The mmc_queue_req will be present only if the request
	 * is issued to the LLD. The request could be fetched from
	 * block layer queue but could be waiting to be issued
	 * (for e.g. clock scaling is waiting for an empty cmdq queue)
	 * Reset the timer in such cases to give LLD more time
	 */
	if (!mq_rq) {
		pr_warn("%s: restart timer for tag: %d\n", __func__, req->tag);
		return BLK_EH_RESET_TIMER;
	}

	mrq = &mq_rq->cmdq_req.mrq;
	cmdq_req = &mq_rq->cmdq_req;

	/* NOTE(review): both are addresses of embedded members, never NULL */
	BUG_ON(!mrq || !cmdq_req);

	/* Record the timeout on the command (DCMD) or the data transfer. */
	if (cmdq_req->cmdq_req_flags & DCMD)
		mrq->cmd->error = -ETIMEDOUT;
	else
		mrq->data->error = -ETIMEDOUT;

	host->err_stats[MMC_ERR_CMDQ_REQ_TIMEOUT]++;

	if (mrq->cmd && mrq->cmd->error) {
		if (!(mrq->req->cmd_flags & REQ_PREFLUSH)) {
			/*
			 * Notify completion for non flush commands like
			 * discard that wait for DCMD finish.
			 */
			set_bit(CMDQ_STATE_REQ_TIMED_OUT,
				&ctx_info->curr_state);
			complete(&mrq->completion);
			return BLK_EH_NOT_HANDLED;
		}
	}

	/* A previous timeout/error is already being recovered. */
	if (test_bit(CMDQ_STATE_REQ_TIMED_OUT, &ctx_info->curr_state) ||
		test_bit(CMDQ_STATE_ERR, &ctx_info->curr_state))
		return BLK_EH_NOT_HANDLED;

	set_bit(CMDQ_STATE_REQ_TIMED_OUT, &ctx_info->curr_state);
	return BLK_EH_HANDLED;
}

先是记录为超时-ETIMEDOUT, 然后把curr_state标记CMDQ_STATE_REQ_TIMED_OUT

/*
 * (Repeated excerpt for the BLK_EH_HANDLED case.) When the driver
 * returns BLK_EH_HANDLED, the block layer completes the request, which
 * proceeds via __blk_complete_request() into softirq context.
 */
static void blk_rq_timed_out(struct request *req)
{
	struct request_queue *q = req->q;
	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;

	if (q->rq_timed_out_fn)
		ret = q->rq_timed_out_fn(req); /* tj: here */
	switch (ret) {
	case BLK_EH_HANDLED:
		__blk_complete_request(req); /* tj: here */

block/blk-softirq.c:

/*
 * __blk_complete_request (excerpt): queue the request on this CPU's
 * blk_cpu_done list and raise BLOCK_SOFTIRQ, whose handler eventually
 * calls q->softirq_done_fn.
 */
void __blk_complete_request(struct request *req)
{
...
	if (ccpu == cpu || shared) {
		struct list_head *list;
do_local:
		list = this_cpu_ptr(&blk_cpu_done);
		list_add_tail(&req->ipi_list, list);

		/*
		 * if the list only contains our just added request,
		 * signal a raise of the softirq. If there are already
		 * entries there, someone already raised the irq but it
		 * hasn't run yet.
		 */
		if (list->next == &req->ipi_list)
			raise_softirq_irqoff(BLOCK_SOFTIRQ); /* tj: here */
	} else if (raise_blk_irq(ccpu, req))
		goto do_local;
...
}

softirq有关,should be raise_softirq_irqoff(), 我们看下mmc drv:

/*
 * mmc_cmdq_init (second excerpt): register the softirq completion
 * handler and the error work item for this queue.
 */
int mmc_cmdq_init(struct mmc_queue *mq, struct mmc_card *card)
{
...
	blk_queue_softirq_done(mq->queue, mmc_cmdq_softirq_done);
	INIT_WORK(&mq->cmdq_err_work, mmc_cmdq_error_work);
/* Install the function BLOCK_SOFTIRQ calls to complete this queue's requests. */
void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
{
	q->softirq_done_fn = fn;
}
blk_done_softirq() -> mmc_cmdq_softirq_done (->softirq_done_fn)
/*
 * Boot-time setup (excerpt): per-CPU completion lists plus registration
 * of blk_done_softirq as the BLOCK_SOFTIRQ handler, which in turn
 * invokes q->softirq_done_fn.
 */
static __init int blk_softirq_init(void)
{
	int i;

	for_each_possible_cpu(i)
		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));

	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
ok, timeout error handler最后走到mmc_cmdq_softirq_done():

/* BLOCK_SOFTIRQ completion: forward to the mmc queue's completion hook. */
static void mmc_cmdq_softirq_done(struct request *rq)
{
	struct mmc_queue *mq = rq->q->queuedata;

	mq->cmdq_complete_fn(rq); /* tj: mmc_blk_cmdq_complete_rq(), check above */
}
/* invoked by block layer in softirq context */
/*
 * Completion handler for CMDQ requests (excerpt). On an error it marks
 * the queue CMDQ_STATE_ERR and schedules the error work exactly once;
 * any request that errors while recovery is already in flight is simply
 * ended.
 */
void mmc_blk_cmdq_complete_rq(struct request *rq)
{
...
	/* tj: first pick up the recorded error (-ETIMEDOUT in our case) */
	if (mrq->cmd && mrq->cmd->error)
		err = mrq->cmd->error;
	else if (mrq->data && mrq->data->error)
		err = mrq->data->error;
	if (cmdq_req->resp_err)
		err_resp = cmdq_req->resp_err;

	/* tj: then act on the error */
	if ((err || err_resp) && !cmdq_req->skip_err_handling) {
		pr_err("%s: %s: txfr error(%d)/resp_err(%d)\n",
			mmc_hostname(mrq->host), __func__, err,
			err_resp);
		if (test_bit(CMDQ_STATE_ERR, &ctx_info->curr_state)) {
			pr_err("%s: CQ in error state, ending current req: %d\n",
				__func__, err);
		} else {
			set_bit(CMDQ_STATE_ERR, &ctx_info->curr_state);
			BUG_ON(host->err_mrq != NULL);
			host->err_mrq = mrq;
			schedule_work(&mq->cmdq_err_work); /* tj: here -> mmc_cmdq_error_work() */
		}
		goto out;
	}
}

因为我们的->curr_state已标记CMDQ_STATE_REQ_TIMED_OUT,check ->cmdq_err_work:

/* Workqueue shim: run the queue's error handler (->cmdq_error_fn). */
static void mmc_cmdq_error_work(struct work_struct *work)
{
	struct mmc_queue *mq = container_of(work, struct mmc_queue,
				cmdq_err_work);

	mq->cmdq_error_fn(mq);
}
/*
 * mmc_blk_cmdq_err: error handling of cmdq error requests.
 * Function should be called in context of error out request
 * which has claim_host and rpm acquired.
 * This may be called with CQ engine halted. Make sure to
 * unhalt it after error recovery.
 *
 * TODO: Currently cmdq error handler does reset_all in case
 * of any error. Need to optimize error handling.
 */
static void mmc_blk_cmdq_err(struct mmc_queue *mq)
{
...
	pr_err("%s: %s Starting cmdq Error handler\n",
		mmc_hostname(host), __func__);
	q = mrq->req->q;
	err = mmc_cmdq_halt(host, true); /* tj: halt the CQ engine first */
	if (err) {
		pr_err("halt: failed: %d\n", err);
		goto reset;
	}
...

	/* tj: on a timeout, fetch the device status with CMD13 */
	/*
	 * TIMEOUT errors can happen because of execution error
	 * in the last command. So send cmd 13 to get device status
	 */
	if ((mrq->cmd && (mrq->cmd->error == -ETIMEDOUT)) ||
		(mrq->data && (mrq->data->error == -ETIMEDOUT))) {
		if (mmc_host_halt(host) || mmc_host_cq_disable(host)) {
			ret = get_card_status(host->card, &status, 0);
			if (ret)
				pr_err("%s: CMD13 failed with err %d\n",
					mmc_hostname(host), ret);
		}
		pr_err("%s: Timeout error detected with device status 0x%08x\n",
			mmc_hostname(host), status);
	}

	/*
	 * In case of software request time-out, we schedule err work only for
	 * the first error out request and handles all other request in flight
	 * here.
	 */
	if (test_bit(CMDQ_STATE_REQ_TIMED_OUT, &ctx_info->curr_state)) {
		err = -ETIMEDOUT;
	} else if (mrq->data && mrq->data->error) {
		err = mrq->data->error;
	} else if (mrq->cmd && mrq->cmd->error) {
		/* DCMD commands */
		err = mrq->cmd->error;
	}

reset:
	mmc_blk_cmdq_reset_all(host, err); /* tj: reset controller + card, requeue busy tags */
	if (mrq->cmdq_req->resp_err)
		mrq->cmdq_req->resp_err = false;
	mmc_cmdq_halt(host, false); /* tj: unhalt the CQ engine */

	host->err_mrq = NULL;

	/* tj: clear CMDQ_STATE_REQ_TIMED_OUT/CMDQ_STATE_ERR */
	clear_bit(CMDQ_STATE_REQ_TIMED_OUT, &ctx_info->curr_state);
	WARN_ON(!test_and_clear_bit(CMDQ_STATE_ERR, &ctx_info->curr_state));

来看mmc_blk_cmdq_reset_all():

/**
 * mmc_blk_cmdq_reset_all - Reset everything for CMDQ block request.
 * @host: mmc_host pointer.
 * @err: error for which reset is performed.
 *
 * This function implements reset_all functionality for
 * cmdq. It resets the controller, power cycle the card,
 * and invalidate all busy tags(requeue all request back to
 * elevator).
 */
static void mmc_blk_cmdq_reset_all(struct mmc_host *host, int err)
{
	/* NOTE(review): the function body is not shown in this excerpt */
看注释是复位controller,掉电复位eMMC,还有就是清理busy tags。call stack:

mmc_blk_cmdq_reset(,false) -> mmc_cmdq_hw_reset()
/*
 * mmc_cmdq_hw_reset: Helper API for doing
 * reset_all of host and reinitializing card.
 * This must be called with mmc_claim_host
 * acquired by the caller.
 */
int mmc_cmdq_hw_reset(struct mmc_host *host)
{
	/* Bus ops must provide a reset handler; mmc_reset() for eMMC. */
	if (!host->bus_ops->reset)
		return -EOPNOTSUPP;

	return host->bus_ops->reset(host); /* tj: mmc_reset() */
}
/*
 * Full card reset: assert the RST_n hardware reset when the host and
 * card support it, otherwise brute-force power-cycle the card; then
 * re-run the software initialization via mmc_init_card().
 */
static int mmc_reset(struct mmc_host *host)
{
	struct mmc_card *card = host->card;
	int ret;

	if ((host->caps & MMC_CAP_HW_RESET) && host->ops->hw_reset &&
		mmc_can_reset(card)) {
		mmc_host_clk_hold(host);
		/* If the card accept RST_n signal, send it. */
		mmc_set_clock(host, host->f_init);
		host->ops->hw_reset(host);
		/* Set initial state and call mmc_set_ios */
		mmc_set_initial_state(host);
		mmc_host_clk_release(host);
	} else {
		/* Do a brute force power cycle */
		mmc_power_cycle(host, card->ocr);
		mmc_pwrseq_reset(host);
	}

	/* NOTE(review): presumably inline-crypto state must be restored after reset */
	if (host->inlinecrypt_support)
		host->inlinecrypt_reset_needed = true;

	ret = mmc_init_card(host, host->card->ocr, host->card);
	if (ret) {
		pr_err("%s: %s: mmc_init_card failed (%d)\n",
			mmc_hostname(host), __func__, ret);
		return ret;
	}

	return ret;
}

主要看host有没有实现reset,如果有直接发复位信号给eMMC。如果没有就强迫power cycle(掉电)。最后走软复位card(mmc_init_card())。

refer

  • JEDEC STANDARD JESD84-B51.pdf