参考LA msm-4.14 kernel,先看下在drivers/soc/qcom/subsys-pil-tz.c里记录失败原因的接口:

/*
 * log_failure_reason() - print the subsystem failure reason (SFR) string.
 * @d: PIL TZ data of the subsystem; supplies the SMEM item id and the name
 *     used as the log prefix.
 *
 * Nothing is printed when d->smem_id is -1. If the SMEM lookup fails, the
 * item has zero size, or the string is empty, a "(unknown, ...)" message is
 * logged instead. Otherwise the reason is copied into a local buffer of
 * MAX_SSR_REASON_LEN bytes (truncating if necessary) and logged.
 */
static void log_failure_reason(const struct pil_tz_data *d)
{
	size_t size;
	size_t len;
	char *smem_reason, reason[MAX_SSR_REASON_LEN];
	const char *name = d->subsys_desc.name;

	if (d->smem_id == -1)
		return;

	smem_reason = qcom_smem_get(QCOM_SMEM_HOST_ANY, d->smem_id, &size); //tj
	if (IS_ERR(smem_reason) || !size) {
		pr_err("%s SFR: (unknown, qcom_smem_get failed).\n",
		       name);
		return;
	}
	if (!smem_reason[0]) {
		pr_err("%s SFR: (unknown, empty string found).\n", name);
		return;
	}

	/*
	 * Bound the scan by both the SMEM item size and the local buffer.
	 * strlcpy() runs strlen() over the whole source, so it could read
	 * past the end of the item when the string is not NUL-terminated.
	 * The copied text is identical to what strlcpy() produced for a
	 * properly terminated string. size is non-zero here (checked above).
	 */
	len = strnlen(smem_reason, min(size, (size_t)MAX_SSR_REASON_LEN) - 1); //tj
	memcpy(reason, smem_reason, len);
	reason[len] = '\0';
	pr_err("%s subsystem failure reason: %s.\n", name, reason);
}

调用qcom_smem_get()获得subsystem复位原因,记录在栈reason中。

ok,既然是stack var,func结束就丢了。如何保存这个reason?因为我们不可能老是连着串口,有的crash不一定能复现,pstore有时没有记录,尤其是用户退货机分析。

在调试阶段,我想一个有效的方法是把这个reason放到保留memory里。对退货机,就要保存到存储上了,直接从fs层写到block设备上就行了。看了下pstore next加了block支持(全志贡献的),回头看看。这里我们主要看下如何保存到存储上。

直接在这个接口里加blk读写是否可以?我们来看代码上下文。

static irqreturn_t subsys_err_fatal_intr_handler (int irq, void *dev_id)
{
struct pil_tz_data *d = subsys_to_data(dev_id);

pr_err("Fatal error on %s!\n", d->subsys_desc.name);
if (subsys_get_crash_status(d->subsys)) {
pr_err("%s: Ignoring error fatal, restart in progress\n",
d->subsys_desc.name);
return IRQ_HANDLED;
}
subsys_set_crash_status(d->subsys, CRASH_STATUS_ERR_FATAL);
log_failure_reason(d); //tj: here
subsystem_restart_dev(d->subsys);

return IRQ_HANDLED;
}
/*
 * Handler for a subsystem's watchdog-bite interrupt.
 *
 * Does nothing when a crash status is already recorded. Otherwise logs the
 * bite, panics if the descriptor's system_debug flag is set, and else marks
 * the subsystem CRASH_STATUS_WDOG_BITE, logs the failure reason and requests
 * a subsystem restart. Returns IRQ_HANDLED in every case.
 */
static irqreturn_t subsys_wdog_bite_irq_handler(int irq, void *dev_id)
{
	struct pil_tz_data *tz = subsys_to_data(dev_id);
	const char *subsys_name;

	if (subsys_get_crash_status(tz->subsys))
		return IRQ_HANDLED;

	subsys_name = tz->subsys_desc.name;
	pr_err("Watchdog bite received from %s!\n", subsys_name);

	if (tz->subsys_desc.system_debug) {
		panic("%s: System ramdump requested. Triggering device restart!\n",
		      __func__);
	}

	subsys_set_crash_status(tz->subsys, CRASH_STATUS_WDOG_BITE);
	log_failure_reason(tz); //tj: here
	subsystem_restart_dev(tz->subsys);

	return IRQ_HANDLED;
}
/*
 * Handle a watchdog bite reported through the generic interrupt handler:
 * write the ERR_READY bit to the irq_clear register, record
 * CRASH_STATUS_WDOG_BITE, log the failure reason and request a restart.
 */
static void clear_wdog(struct pil_tz_data *d)
{
	/* A crash status is already set: the device is restarting, do nothing. */
	if (subsys_get_crash_status(d->subsys))
		return;

	pr_err("wdog bite received from %s!\n", d->subsys_desc.name);
	__raw_writel(BIT(d->bits_arr[ERR_READY]), d->irq_clear);
	subsys_set_crash_status(d->subsys, CRASH_STATUS_WDOG_BITE);
	log_failure_reason(d); //tj:here
	subsystem_restart_dev(d->subsys);
}
static irqreturn_t subsys_generic_handler(int irq, void *dev_id)
{
struct pil_tz_data *d = subsys_to_data(dev_id);
uint32_t status_val, err_value;

err_value = __raw_readl(d->err_status_spare);
status_val = __raw_readl(d->irq_status);

if ((status_val & BIT(d->bits_arr[ERR_READY])) && !err_value)
clear_err_ready(d);

if ((status_val & BIT(d->bits_arr[ERR_READY])) &&
err_value == 0x44554d50)
clear_wdog(d);

ok, 是在中断服务里call log_failure_reason()

static int pil_tz_driver_probe(struct platform_device *pdev)
{
...
if (of_property_read_bool(pdev->dev.of_node,
"qcom,pil-generic-irq-handler")) {
d->subsys_desc.generic_handler = subsys_generic_handler;
...
} else {
d->subsys_desc.err_fatal_handler =
subsys_err_fatal_intr_handler;
d->subsys_desc.wdog_bite_handler = subsys_wdog_bite_irq_handler;
}

一般都走Fatal error or Watchdog bite received

首先设置crash状态:

subsys_set_crash_status(d->subsys, CRASH_STATUS_ERR_FATAL);
subsys_set_crash_status(d->subsys, CRASH_STATUS_WDOG_BITE);
/**
 * subsys_set_crash_status() - record the crash status of a subsystem device
 * @dev: subsystem device to update
 * @crashed: crash status value to store
 *
 * Stores @crashed in dev->crashed; no locking is done here.
 */
void subsys_set_crash_status(struct subsys_device *dev,
enum crash_status crashed)
{
dev->crashed = crashed;
}

然后call log_failure_reason()记录,中断服务最后call subsystem_restart_dev(),看下这个restart接口:

/**
 * subsystem_restart_dev() - request a restart for a crashed subsystem
 * @dev: subsystem device that crashed
 *
 * Takes a device reference and a module reference, sends the early
 * notifications, then acts according to dev->restart_level:
 * RESET_SUBSYS_COUPLED hands over to __subsystem_restart_dev(), RESET_SOC
 * schedules dev->device_restart_work, and any other level panics.
 *
 * Return: 0 on success or when SSR is disabled, -ENODEV if a reference could
 * not be taken, -EBUSY if a system restart/power-off is already underway.
 */
int subsystem_restart_dev(struct subsys_device *dev)
{
	const char *name;
	int ret = 0;

	if (!get_device(&dev->dev))
		return -ENODEV;

	if (!try_module_get(dev->owner)) {
		put_device(&dev->dev);
		return -ENODEV;
	}

	name = dev->desc->name;

	send_early_notifications(dev->early_notify);

	/*
	 * If a system reboot/shutdown is underway, ignore subsystem errors.
	 * However, print a message so that we know that a subsystem behaved
	 * unexpectedly here.
	 */
	if (system_state == SYSTEM_RESTART
		|| system_state == SYSTEM_POWER_OFF) {
		pr_err("%s crashed during a system poweroff/shutdown.\n", name);
		ret = -EBUSY; //tj
		/* Drop the references taken above instead of leaking them. */
		goto out_put;
	}

	pr_info("Restart sequence requested for %s, restart_level = %s.\n",
		name, restart_levels[dev->restart_level]);

	if (disable_restart_work == DISABLE_SSR) {
		pr_warn("subsys-restart: Ignoring restart request for %s\n",
			name);
		/* Nothing was queued, so the references must be released. */
		goto out_put;
	}

	switch (dev->restart_level) {

	case RESET_SUBSYS_COUPLED:
		__subsystem_restart_dev(dev);
		break;
	case RESET_SOC:
		__pm_stay_awake(&dev->ssr_wlock);
		schedule_work(&dev->device_restart_work);
		/*
		 * The references are kept on this path, as in the original:
		 * the scheduled work still dereferences dev and ends in
		 * panic().
		 */
		return 0;
	default:
		panic("subsys-restart: Unknown restart level!\n");
		break;
	}

out_put:
	module_put(dev->owner);
	put_device(&dev->dev);

	return ret;
}

先看系统状态,如果reboot/shutdown正在进行中就认为忙,退出。如果禁用了SSR,直接返回。接下来是主要功能,按->restart_level走不同的流程,这两个level定义:

/*
 * Printable names of the restart levels, indexed by the RESET_* value held
 * in subsys_device::restart_level; used in the restart log messages.
 */
static const char * const restart_levels[] = {
[RESET_SOC] = "SYSTEM",
[RESET_SUBSYS_COUPLED] = "RELATED",
};

先看RESET_SOC,从字面看就是系统级的了:

case RESET_SOC:
__pm_stay_awake(&dev->ssr_wlock);
schedule_work(&dev->device_restart_work);
return 0;

struct subsys_device *subsys_register(struct subsys_desc *desc)
{
...
INIT_WORK(&subsys->device_restart_work, device_restart_work_hdlr);

/**
* struct subsys_device - subsystem device
...
* @device_restart_work: work struct for device restart
...
*/
struct subsys_device {
struct subsys_desc *desc;
struct work_struct work;
struct wakeup_source ssr_wlock;
char wlname[64];
struct work_struct device_restart_work;

ok, 进work task device_restart_work_hdlr()看下:

/*
 * Work handler behind subsys_device::device_restart_work (RESET_SOC level).
 *
 * Sends SUBSYS_SOC_RESET for the crashed device, waits 100 ms and then
 * panics with the name of the crashed subsystem.
 */
static void device_restart_work_hdlr(struct work_struct *work)
{
	struct subsys_device *crashed;

	crashed = container_of(work, struct subsys_device,
			       device_restart_work);

	notify_each_subsys_device(&crashed, 1, SUBSYS_SOC_RESET, NULL);

	/*
	 * Temporary workaround until ramdump userspace application calls
	 * sync() and fclose() on attempting the dump.
	 */
	msleep(100);

	panic("subsys-restart: Resetting the SoC - %s crashed.",
	      crashed->desc->name);
}

call notify_each_subsys_device()通知每个子系统复位:

/*
 * notify_each_subsys_device() - deliver one notification for each device in
 * a list.
 * @list:  array of subsystem device pointers; NULL entries are skipped
 * @count: number of entries in @list
 * @notif: notification type to deliver
 * @data:  not used in this function body
 *
 * For every device: record @notif in dev->notif_state, send a sysmon event
 * to every *other* subsystem on subsys_list whose track.state is
 * SUBSYS_ONLINE, then queue the notification (with a filled-in notif_data)
 * on the device's own notifier and emit a uevent.
 */
static void notify_each_subsys_device(struct subsys_device **list,
unsigned int count,
enum subsys_notif_type notif, void *data)
{
struct subsys_device *subsys;

while (count--) {
struct subsys_device *dev = *list++;
struct notif_data notif_data;
struct platform_device *pdev;

if (!dev)
continue;

/* Recover the platform device that embeds dev->desc->dev. */
pdev = container_of(dev->desc->dev, struct platform_device,
dev);
dev->notif_state = notif;

/* Walk the global subsystem list under subsys_list_lock. */
mutex_lock(&subsys_list_lock);
list_for_each_entry(subsys, &subsys_list, list)
if (dev != subsys &&
subsys->track.state == SUBSYS_ONLINE) {
/*
 * Each sysmon event is bracketed by setup_timeout() /
 * cancel_timeout() - presumably a guard against a peer that
 * never answers; confirm against their definitions.
 */
setup_timeout(dev->desc, subsys->desc,
SUBSYS_TO_SUBSYS_SYSMON);
sysmon_send_event(subsys->desc, dev->desc,
notif); //tj
cancel_timeout(dev->desc);
}
mutex_unlock(&subsys_list_lock);

if (notif == SUBSYS_AFTER_POWERUP &&
dev->track.state == SUBSYS_ONLINE)
send_sysmon_notif(dev);

/* Payload handed to the listeners on dev->notify. */
notif_data.crashed = subsys_get_crash_status(dev);
notif_data.enable_ramdump = is_ramdump_enabled(dev); //tj
notif_data.enable_mini_ramdumps = enable_mini_ramdumps;
notif_data.no_auth = dev->desc->no_auth;
notif_data.pdev = pdev;

trace_pil_notif("before_send_notif", notif, dev->desc->fw_name);
setup_timeout(dev->desc, NULL, SUBSYS_TO_HLOS);
subsys_notif_queue_notification(dev->notify, notif,
&notif_data);
cancel_timeout(dev->desc);
trace_pil_notif("after_send_notif", notif, dev->desc->fw_name);
subsys_notif_uevent(dev->desc, notif);
}
}

call list_for_each_entry()去遍历每一个在线子系统,notif_data包括是否使能enable_ramdump和enable_mini_ramdumps。如果使能,应该就去dump ram了。

so, 才能有:

/*
* Temporary workaround until ramdump userspace application calls
* sync() and fclose() on attempting the dump.
*/
msleep(100);

am i right? 最后call kernel panic()。

ok, 我们再看另一个level: RELATED

case RESET_SUBSYS_COUPLED:
__subsystem_restart_dev(dev);
break;
/*
 * __subsystem_restart_dev() - start the restart sequence for one subsystem
 * (RESET_SUBSYS_COUPLED level).
 * @dev: subsystem device that crashed
 *
 * Under track->s_lock: if the tracked p_state is not already SUBSYS_CRASHED
 * and the device is SUBSYS_ONLINE, either mark it SUBSYS_CRASHED, hold the
 * ssr_wlock wakeup source and queue dev->work on ssr_wq, or - when p_state
 * is SUBSYS_RESTARTING - panic. In all other cases only a WARN is issued,
 * and only if the device is SUBSYS_OFFLINE.
 */
static void __subsystem_restart_dev(struct subsys_device *dev)
{
struct subsys_desc *desc = dev->desc;
const char *name = dev->desc->name;
struct subsys_tracking *track;
unsigned long flags;

pr_debug("Restarting %s [level=%s]!\n", desc->name,
restart_levels[dev->restart_level]);

track = subsys_get_track(dev);
/*
* Allow drivers to call subsystem_restart{_dev}() as many times as
* they want up until the point where the subsystem is shutdown.
*/
spin_lock_irqsave(&track->s_lock, flags);
if (track->p_state != SUBSYS_CRASHED &&
dev->track.state == SUBSYS_ONLINE) {
if (track->p_state != SUBSYS_RESTARTING) {
/* First crash report: hand the restart over to the workqueue. */
track->p_state = SUBSYS_CRASHED;
__pm_stay_awake(&dev->ssr_wlock);
queue_work(ssr_wq, &dev->work); //tj
} else {
/* A crash reported while p_state is SUBSYS_RESTARTING is fatal. */
panic("Subsystem %s crashed during SSR!", name);
}
} else
WARN(dev->track.state == SUBSYS_OFFLINE,
"SSR aborted: %s subsystem not online\n", name);
spin_unlock_irqrestore(&track->s_lock, flags);
}
INIT_WORK(&subsys->work, subsystem_restart_wq_func);

check subsystem_restart_wq_func():

static void subsystem_restart_wq_func(struct work_struct *work)
{
...
pr_debug("[%s:%d]: Starting restart sequence for %s\n",
current->comm, current->pid, desc->name);
notify_each_subsys_device(list, count, SUBSYS_BEFORE_SHUTDOWN, NULL);
ret = for_each_subsys_device(list, count, NULL, subsystem_shutdown);
if (ret)
goto err;
notify_each_subsys_device(list, count, SUBSYS_AFTER_SHUTDOWN, NULL);

notify_each_subsys_device(list, count, SUBSYS_RAMDUMP_NOTIFICATION,
NULL);

spin_lock_irqsave(&track->s_lock, flags);
track->p_state = SUBSYS_RESTARTING;
spin_unlock_irqrestore(&track->s_lock, flags);

/* Collect ram dumps for all subsystems in order here */
for_each_subsys_device(list, count, NULL, subsystem_ramdump);

for_each_subsys_device(list, count, NULL, subsystem_free_memory);

notify_each_subsys_device(list, count, SUBSYS_BEFORE_POWERUP, NULL);
ret = for_each_subsys_device(list, count, NULL, subsystem_powerup);
if (ret)
goto err;
notify_each_subsys_device(list, count, SUBSYS_AFTER_POWERUP, NULL);

pr_info("[%s:%d]: Restart sequence for %s completed.\n",
current->comm, current->pid, desc->name);
...


/*
 * for_each_subsys_device() - call @fn on every non-NULL device in @list.
 * @list:  array of subsystem device pointers; NULL entries are skipped
 * @count: number of entries in @list
 * @data:  opaque argument passed through to @fn
 * @fn:    callback invoked as fn(dev, data)
 *
 * Return: the first non-zero value returned by @fn (iteration stops there),
 * or 0 if every call returned 0 or nothing was called.
 */
static int for_each_subsys_device(struct subsys_device **list,
				  unsigned int count, void *data,
				  int (*fn)(struct subsys_device *, void *))
{
	unsigned int idx;

	for (idx = 0; idx < count; idx++) {
		struct subsys_device *entry = list[idx];
		int status;

		if (!entry)
			continue;

		status = fn(entry, data);
		if (status)
			return status;
	}

	return 0;
}

能看到这个接口在做ramdump:

static int subsystem_ramdump(struct subsys_device *dev, void *data)
{
const char *name = dev->desc->name;

if (dev->desc->ramdump)
if (dev->desc->ramdump(is_ramdump_enabled(dev), dev->desc) < 0)
pr_warn("%s[%s:%d]: Ramdump failed.\n",
name, current->comm, current->pid);
dev->do_ramdump_on_put = false;
return 0;
}

可以看到,他没有像SYSTEM level那样直接call kernel panic。也就是所谓的subsystem restart?

好了,到这里应该知道在哪里加入复位原因到block device了。也可以参考下面的修改: