pstore ramoops overview | TJ的技术博客

pstore就是persistent store，是一种内存文件系统，提供了一种机制用来存储一些有用的信息，最初是给带non-volatile storage的设备使用，用来debug system crash，存储的是kernel panic/oops日志；后来引入了ramoops作为backend，并增加了对kernel console log的存储，Android平台又增加了对user-space消息（pmsg）的存储。

ok，下面具体分析，参考代码是kernel 4.9：

先看下pstore内核配置：

obj-$(CONFIG_PSTORE) += pstore.o

pstore-objs += inode.o platform.o
pstore-$(CONFIG_PSTORE_FTRACE)  += ftrace.o

pstore-$(CONFIG_PSTORE_PMSG)    += pmsg.o

ramoops-objs += ram.o ram_core.o
obj-$(CONFIG_PSTORE_RAM)        += ramoops.o

CONFIG_PSTORE机制相关代码：inode.c + platform.c，看下配置说明：

config PSTORE
        tristate "Persistent store support"
        default n
        help
           This option enables generic access to platform level
           persistent storage via "pstore" filesystem that can 
           be mounted as /dev/pstore.  Only useful if you have
           a platform level driver that registers with pstore to
           provide the data, so you probably should just go say "Y" 
           (or "M") to a platform specific persistent store driver
           (e.g. ACPI_APEI on X86) which will select this for you.
           If you don't have a platform persistent store driver,
           say N.

这里Only useful说的就是开启pstore必须要提供一个persistent store driver，比如ACPI_APEI on X86，这个就是最初的non-volatile storage driver，代码路径在：drivers/acpi/apei/erst.c。

而现在基本都已被ramoops driver取代，也就是CONFIG_PSTORE_RAM：

config PSTORE_RAM
        tristate "Log panic/oops to a RAM buffer"
        depends on PSTORE
        depends on HAS_IOMEM
        depends on HAVE_MEMBLOCK
        select REED_SOLOMON
        select REED_SOLOMON_ENC8
        select REED_SOLOMON_DEC8
        help
          This enables panic and oops messages to be logged to a circular
          buffer in RAM where it can be read back at some later point.

          Note that for historical reasons, the module will be named
          "ramoops.ko".

          For more information, see Documentation/ramoops.txt.

相关代码：ram.c + ram_core.c。

CONFIG_PSTORE + CONFIG_PSTORE_RAM是核心框架代码，CONFIG_PSTORE_CONSOLE主要是不管是不是crash都保存all kernel message；另外还有CONFIG_PSTORE_PMSG，用来记录user space的消息，配置如下：

config PSTORE_PMSG
        bool "Log user space messages"
        depends on PSTORE
        help
          When the option is enabled, pstore will export a character
          interface /dev/pmsg0 to log user space messages. On reboot
          data can be retrieved from /sys/fs/pstore/pmsg-ramoops-[ID].

          If unsure, say N.

下面主要看下CONFIG_PSTORE、CONFIG_PSTORE_RAM、CONFIG_PSTORE_CONSOLE这3个宏相关的代码。

pstore文件系统位置在：

xxx:/ # ls /sys/fs/pstore
console-ramoops-0 dmesg-ramoops-0

console开头就是all kernel message，而dmesg开头的就是crash记录的了，代码在inode.c pstore_mkfile里：

/*
 * Make a regular file in the root directory of our file system.
 * Load it up with "size" bytes of data from "buf".
 * Set the mtime & ctime to the date that this record was originally stored.
 */
int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
		  char *data, bool compressed, size_t size,
		  struct timespec time, struct pstore_info *psi)
{
	...
	switch (type) {
	case PSTORE_TYPE_DMESG: //tj: crash log
		scnprintf(name, sizeof(name), "dmesg-%s-%lld%s",
			  psname, id, compressed ? ".enc.z" : "");
		break;
	case PSTORE_TYPE_CONSOLE: //tj: all kernel messages
		scnprintf(name, sizeof(name), "console-%s-%lld", psname, id);
		break;

ramoops负责把message write到某个ram区域上，platform负责从ram读取存到/sys/fs/pstore，ok，先来看机制代码platform.c：

backend需要用pstore_register来注册：

/*
 * platform specific persistent storage driver registers with
 * us here. If pstore is already mounted, call the platform
 * read function right away to populate the file system. If not
 * then the pstore mount code will call us later to fill out
 * the file system.
 */
int pstore_register(struct pstore_info *psi)
{
	struct module *owner = psi->owner;

	if (backend && strcmp(backend, psi->name))
		return -EPERM;

	spin_lock(&pstore_lock);
	if (psinfo) {
		spin_unlock(&pstore_lock);
		return -EBUSY;
	}

	if (!psi->write)
		psi->write = pstore_write_compat;
	if (!psi->write_buf_user)
		psi->write_buf_user = pstore_write_buf_user_compat;
	psinfo = psi;
	mutex_init(&psinfo->read_mutex);
	spin_unlock(&pstore_lock);
	...
	/*
	 * Update the module parameter backend, so it is visible
	 * through /sys/module/pstore/parameters/backend
	 */
	backend = psi->name;

	module_put(owner);

	pr_info("Registered %s as persistent store backend\n", psi->name);

/*
 * pstore_lock just protects "psinfo" during
 * calls to pstore_register()
 */
static DEFINE_SPINLOCK(pstore_lock);
/*
 * The backend currently registered with pstore. pstore_register() sets it
 * and returns -EBUSY when it is already non-NULL, so at most one backend
 * is active at a time.
 */
struct pstore_info *psinfo;

backend判断确保一次只能有一个并记录了全局psinfo。

看下结构体pstore_info:

/*
 * Description of a pstore backend, handed to pstore_register(). The pstore
 * core keeps the registered instance in the global psinfo and invokes the
 * callbacks below through it.
 */
struct pstore_info {
	/* backend module; pstore_register() calls module_put() on it */
	struct module	*owner;
	/* backend name; also used as the middle part of the pstore file names */
	char		*name;
	spinlock_t	buf_lock;	/* serialize access to 'buf' */
	/* staging buffer of 'bufsize' bytes that ->write consumes */
	char		*buf;
	size_t		bufsize;
	struct mutex	read_mutex;	/* serialize open/read/close */
	/* PSTORE_FLAGS_* bits: which frontends pstore_register() enables */
	int		flags;
	/* optional; pstore_get_records() skips them when NULL */
	int		(*open)(struct pstore_info *psi);
	int		(*close)(struct pstore_info *psi);
	/*
	 * Fetch the next stored record. pstore_get_records() keeps calling
	 * this while the returned size is > 0.
	 */
	ssize_t		(*read)(u64 *id, enum pstore_type_id *type,
			int *count, struct timespec *time, char **buf,
			bool *compressed, ssize_t *ecc_notice_size,
			struct pstore_info *psi);
	/* store 'size' bytes from ->buf; defaults to pstore_write_compat */
	int		(*write)(enum pstore_type_id type,
			enum kmsg_dump_reason reason, u64 *id,
			unsigned int part, int count, bool compressed,
			size_t size, struct pstore_info *psi);
	/* store 'size' bytes from an explicit kernel buffer */
	int		(*write_buf)(enum pstore_type_id type,
			enum kmsg_dump_reason reason, u64 *id,
			unsigned int part, const char *buf, bool compressed,
			size_t size, struct pstore_info *psi);
	/*
	 * store 'size' bytes from a user-space buffer; defaults to
	 * pstore_write_buf_user_compat
	 */
	int		(*write_buf_user)(enum pstore_type_id type,
			enum kmsg_dump_reason reason, u64 *id,
			unsigned int part, const char __user *buf,
			bool compressed, size_t size, struct pstore_info *psi);
	/* NOTE(review): presumably removes a stored record; no caller is shown here */
	int		(*erase)(enum pstore_type_id type, u64 id,
			int count, struct timespec time,
			struct pstore_info *psi);
	/* backend-private pointer; ramoops stores its ramoops_context here */
	void		*data;
};

name就是backend的name了。

*write和*write_buf_user如果backend没有给出会有个默认compat func，最终都走的*write_buf。

if (!psi->write)
	psi->write = pstore_write_compat;
if (!psi->write_buf_user)
	psi->write_buf_user = pstore_write_buf_user_compat;

/*
 * Default ->write callback, installed by pstore_register() when the backend
 * leaves ->write unset. It forwards the request to the backend's ->write_buf,
 * supplying the global psinfo->buf as the data buffer; 'count' is not passed
 * on. Returns whatever ->write_buf returns.
 */
static int pstore_write_compat(enum pstore_type_id type,
			       enum kmsg_dump_reason reason,
			       u64 *id, unsigned int part, int count,
			       bool compressed, size_t size,
			       struct pstore_info *psi)
{
	return psi->write_buf(type, reason, id, part, psinfo->buf, compressed,
			     size, psi);
}

static int pstore_write_buf_user_compat(enum pstore_type_id type,
			       enum kmsg_dump_reason reason,
			       u64 *id, unsigned int part,
			       const char __user *buf,
			       bool compressed, size_t size,
			       struct pstore_info *psi)
{
...
		ret = psi->write_buf(type, reason, id, part, psinfo->buf,
...
}

继续pstore注册：

if (pstore_is_mounted())
	pstore_get_records(0);

如果pstore已经mounted，那就创建并填充文件by pstore_get_records:

/*
 * Read all the records from the persistent store. Create
 * files in our filesystem.  Don't warn about -EEXIST errors
 * when we are re-scanning the backing store looking to add new
 * error records.
 */
void pstore_get_records(int quiet)
{
	struct pstore_info *psi = psinfo; //tj: global psinfo
	...
	mutex_lock(&psi->read_mutex);
	if (psi->open && psi->open(psi))
		goto out;

	while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed,
				 &ecc_notice_size, psi)) > 0) {
		if (compressed && (type == PSTORE_TYPE_DMESG)) {
			if (big_oops_buf)
				unzipped_len = pstore_decompress(buf,
							big_oops_buf, size,
							big_oops_buf_sz);

			if (unzipped_len > 0) {
				if (ecc_notice_size)
					memcpy(big_oops_buf + unzipped_len,
					       buf + size, ecc_notice_size);
				kfree(buf);
				buf = big_oops_buf;
				size = unzipped_len;
				compressed = false;
			} else {
				pr_err("decompression failed;returned %d\n",
				       unzipped_len);
				compressed = true;
			}
		}
		rc = pstore_mkfile(type, psi->name, id, count, buf,
				   compressed, size + ecc_notice_size,
				   time, psi);
		if (unzipped_len < 0) {
			/* Free buffer other than big oops */
			kfree(buf);
			buf = NULL;
		} else
			unzipped_len = -1;
		if (rc && (rc != -EEXIST || !quiet))
			failed++;
	}
	if (psi->close)
		psi->close(psi);
out:
	mutex_unlock(&psi->read_mutex);

if needed，call pstore_decompress解压然后创建pstore文件by vfs接口pstore_mkfile。

pstore注册接下来是按类别分别注册：

if (psi->flags & PSTORE_FLAGS_DMESG)
	pstore_register_kmsg();
if (psi->flags & PSTORE_FLAGS_CONSOLE)
	pstore_register_console();
if (psi->flags & PSTORE_FLAGS_FTRACE)
	pstore_register_ftrace();
if (psi->flags & PSTORE_FLAGS_PMSG)
	pstore_register_pmsg();

psi->flags仍是由backend决定，只看pstore_register_kmsg和pstore_register_console。

pstore panic log注册：

/*
 * kmsg dumper whose ->dump callback is pstore_dump(); it is handed to
 * kmsg_dump_register() by pstore_register_kmsg().
 */
static struct kmsg_dumper pstore_dumper = {
	.dump = pstore_dump,
};

/*
 * Register with kmsg_dump to save last part of console log on panic.
 *
 * Registers pstore_dumper, so pstore_dump() becomes one of the ->dump
 * callbacks that kmsg_dump() invokes. The return value of
 * kmsg_dump_register() is not checked.
 */
static void pstore_register_kmsg(void)
{
	kmsg_dump_register(&pstore_dumper);
}

pstore_dump最终会call backend的write，直接用全局psinfo。

/*
 * callback from kmsg_dump. (s2,l2) has the most recently
 * written bytes, older bytes are in (s1,l1). Save as much
 * as we can from the end of the buffer.
 */
static void pstore_dump(struct kmsg_dumper *dumper,
			enum kmsg_dump_reason reason)
{
	...
		ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
				    oopscount, compressed, total_len, psinfo);

kmsg_dump_register是内核一种增加log dumper方法，called when kernel oopses or panic。

static LIST_HEAD(dump_list);

/**
 * kmsg_dump_register - register a kernel log dumper.
 * @dumper: pointer to the kmsg_dumper structure
 *
 * Adds a kernel log dumper to the system. The dump callback in the
 * structure will be called when the kernel oopses or panics and must be
 * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise.
 */
int kmsg_dump_register(struct kmsg_dumper *dumper)
{

/**
 * kmsg_dump - dump kernel log to kernel message dumpers.
 * @reason: the reason (oops, panic etc) for dumping
 *
 * Call each of the registered dumper's dump() callback, which can
 * retrieve the kmsg records with kmsg_dump_get_line() or
 * kmsg_dump_get_buffer().
 */
void kmsg_dump(enum kmsg_dump_reason reason)
{
        list_for_each_entry_rcu(dumper, &dump_list, list) {
	...
                /* invoke dumper which will iterate over records */
                dumper->dump(dumper, reason);  //tj: call pstore_dump

such as panic:

/**
 *      panic - halt the system
 *      @fmt: The text string to print
 *
 *      Display a message, then perform cleanups.
 *
 *      This function never returns.
 */
void panic(const char *fmt, ...)
{
	...
        /* Call flush even twice. It tries harder with a single online CPU */
        printk_nmi_flush_on_panic();
        kmsg_dump(KMSG_DUMP_PANIC);

pstore console 注册：

/*
 * Console descriptor registered by pstore_register_console(). Text written
 * to this console is handled by pstore_console_write().
 */
static struct console pstore_console = {
	.name	= "pstore",
	.write	= pstore_console_write,
	.flags	= CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME,
	.index	= -1,
};

/*
 * Register pstore_console as a kernel console; called from pstore_register()
 * when the backend sets PSTORE_FLAGS_CONSOLE.
 */
static void pstore_register_console(void)
{
	register_console(&pstore_console);
}

->write最终也会call backend write:

#ifdef CONFIG_PSTORE_CONSOLE
/*
 * Console ->write hook: hands the 'c' bytes starting at 's' to the backend
 * as PSTORE_TYPE_CONSOLE data, in chunks of at most psinfo->bufsize bytes.
 * Each chunk is copied into psinfo->buf under buf_lock and then passed to
 * psinfo->write. The return value of ->write is ignored.
 */
static void pstore_console_write(struct console *con, const char *s, unsigned c)
{
	/* one past the last byte that has to be stored */
	const char *e = s + c;

	while (s < e) {
		unsigned long flags;
		u64 id;

		/* clamp this chunk to the size of the staging buffer */
		if (c > psinfo->bufsize)
			c = psinfo->bufsize;

		/*
		 * While an oops is in progress only try the lock; if it is
		 * held, give up and drop the remaining text (presumably to
		 * avoid deadlocking on a lock whose owner was interrupted --
		 * TODO confirm).
		 */
		if (oops_in_progress) {
			if (!spin_trylock_irqsave(&psinfo->buf_lock, flags))
				break;
		} else {
			spin_lock_irqsave(&psinfo->buf_lock, flags);
		}
		memcpy(psinfo->buf, s, c);
		psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo);  // tj: here
		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
		/* advance past the stored chunk; the rest is the next candidate */
		s += c;
		c = e - s;
	}
}

ok。下面来看下RAM backend: ramoops，先看probe:

static int ramoops_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;
	struct ramoops_platform_data *pdata = dev->platform_data;
	...

	if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
			!pdata->ftrace_size && !pdata->pmsg_size)) {
		pr_err("The memory size and the record/console size must be "
			"non-zero\n");
		goto fail_out;
	}
	...
	
	cxt->size = pdata->mem_size;
	cxt->phys_addr = pdata->mem_address;
	cxt->memtype = pdata->mem_type;
	cxt->record_size = pdata->record_size;
	cxt->console_size = pdata->console_size;
	cxt->ftrace_size = pdata->ftrace_size;
	cxt->pmsg_size = pdata->pmsg_size;
	cxt->dump_oops = pdata->dump_oops;
	cxt->ecc_info = pdata->ecc_info;

pdata应该来源ramoops_register_dummy:

static void ramoops_register_dummy(void)
{
	...
	pr_info("using module parameters\n");

	dummy_data = kzalloc(sizeof(*dummy_data), GFP_KERNEL);
	if (!dummy_data) {
		pr_info("could not allocate pdata\n");
		return;
	}

	dummy_data->mem_size = mem_size;
	dummy_data->mem_address = mem_address;
	dummy_data->mem_type = mem_type;
	dummy_data->record_size = record_size;
	dummy_data->console_size = ramoops_console_size;
	dummy_data->ftrace_size = ramoops_ftrace_size;
	dummy_data->pmsg_size = ramoops_pmsg_size;
	dummy_data->dump_oops = dump_oops;
	/*
	 * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
	 * (using 1 byte for ECC isn't much of use anyway).
	 */
	dummy_data->ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc;

	dummy = platform_device_register_data(NULL, "ramoops", -1,
			dummy_data, sizeof(struct ramoops_platform_data));

有几个可配参数:

/*
 * Ramoops platform data
 * @mem_size	memory size for ramoops
 * @mem_address	physical memory address to contain ramoops
 * @mem_type	copied to the context's memtype and passed on to
 *		persistent_ram_new()
 * @record_size	size of one oops/panic dump record zone
 * @console_size	size of the console zone
 * @ftrace_size	size of the ftrace zone
 * @pmsg_size	size of the pmsg zone
 * @dump_oops	copied to the context by ramoops_probe()
 *		(NOTE(review): its effect is not visible in this excerpt)
 * @ecc_info	ECC settings handed to persistent_ram_new()
 *
 * ramoops_probe() requires mem_size and at least one of the four zone
 * sizes to be non-zero.
 */

struct ramoops_platform_data {
	unsigned long	mem_size;  
	phys_addr_t	mem_address; 
	unsigned int	mem_type;
	unsigned long	record_size;
	unsigned long	console_size;
	unsigned long	ftrace_size;
	unsigned long	pmsg_size;
	int		dump_oops;
	struct persistent_ram_ecc_info ecc_info;
};

有个结构表示了ramoops的context:

/*
 * Runtime state of the ramoops backend. ramoops_probe() fills the size and
 * address fields from ramoops_platform_data and then creates one
 * persistent RAM zone per log type.
 */
struct ramoops_context {
	/* oops/panic dump zones, max_dump_cnt entries of record_size each */
	struct persistent_ram_zone **przs;
	/* console zone (console_size bytes) */
	struct persistent_ram_zone *cprz;
	/* ftrace zone (ftrace_size bytes) */
	struct persistent_ram_zone *fprz;
	/* pmsg zone (pmsg_size bytes) */
	struct persistent_ram_zone *mprz;
	/* start and total size of the reserved region (mem_address/mem_size) */
	phys_addr_t phys_addr;
	unsigned long size;
	unsigned int memtype;
	size_t record_size;
	size_t console_size;
	size_t ftrace_size;
	size_t pmsg_size;
	int dump_oops;
	struct persistent_ram_ecc_info ecc_info;
	/* number of dump zones: dump_mem_sz / record_size */
	unsigned int max_dump_cnt;
	unsigned int dump_write_cnt;
	/* _read_cnt need clear on ramoops_pstore_open */
	unsigned int dump_read_cnt;
	unsigned int console_read_cnt;
	unsigned int ftrace_read_cnt;
	unsigned int pmsg_read_cnt;
	/* backend descriptor passed to pstore_register(); .data points back here */
	struct pstore_info pstore;
};

在ramoops_probe时也是把ramoops_platform_data的成员赋给了context对应的。要了解具体含义，继续probe:

paddr = cxt->phys_addr;

dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
		- cxt->pmsg_size;
err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz);
if (err)
	goto fail_out;

err = ramoops_init_prz(dev, cxt, &cxt->cprz, &paddr,
		       cxt->console_size, 0);
if (err)
	goto fail_init_cprz;

err = ramoops_init_prz(dev, cxt, &cxt->fprz, &paddr, cxt->ftrace_size,
		       LINUX_VERSION_CODE);
if (err)
	goto fail_init_fprz;

err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0);
if (err)
	goto fail_init_mprz;

cxt->pstore.data = cxt;

可见，是逐个init每个persistent ram zone，size一共有4段：

dump_mem_sz + cxt->console_size + cxt->ftrace_size + cxt->pmsg_size = cxt->size

so mem_size就是总大小了，mem_address是ramoops的物理地址；record_size的含义要再看下oops/panic ram zone的初始化：

static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
			     phys_addr_t *paddr, size_t dump_mem_sz)
{
	int err = -ENOMEM;
	int i;

	if (!cxt->record_size)
		return 0;

	if (*paddr + dump_mem_sz - cxt->phys_addr > cxt->size) {
		dev_err(dev, "no room for dumps\n");
		return -ENOMEM;
	}

	cxt->max_dump_cnt = dump_mem_sz / cxt->record_size;
	if (!cxt->max_dump_cnt)
		return -ENOMEM;

ok，dump_mem_sz大小的区域分成max_dump_cnt个，每个记录大小是record_size。

接着会call persistent_ram_new来分配内存给这个ram zone。

for (i = 0; i < cxt->max_dump_cnt; i++) {
	cxt->przs[i] = persistent_ram_new(*paddr, cxt->record_size, 0,
					  &cxt->ecc_info,
					  cxt->memtype, 0);

console/ftrace/pmsg ram zone同上分配。

最后处理flags并注册pstore:

cxt->pstore.flags = PSTORE_FLAGS_DMESG; //tj: 默认dump oops/panic
if (cxt->console_size)
	cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE;
if (cxt->ftrace_size)
	cxt->pstore.flags |= PSTORE_FLAGS_FTRACE;
if (cxt->pmsg_size)
	cxt->pstore.flags |= PSTORE_FLAGS_PMSG;

err = pstore_register(&cxt->pstore);
if (err) {
	pr_err("registering with pstore failed\n");
	goto fail_buf;
}

来看下ramoops为pstore定义的callback，pstore core就是通过全局psinfo来调用它们的：

/*
 * The single ramoops context, with its pstore backend callbacks filled in.
 * ->write is deliberately left unset here, so pstore_register() installs
 * pstore_write_compat, which forwards to ->write_buf.
 */
static struct ramoops_context oops_cxt = {
	.pstore = {
		.owner	= THIS_MODULE,
		.name	= "ramoops",
		.open	= ramoops_pstore_open,
		.read	= ramoops_pstore_read, //tj: psi->read
		.write_buf	= ramoops_pstore_write_buf, //tj: for non pmsg
		.write_buf_user	= ramoops_pstore_write_buf_user, //tj: for pmsg
		.erase	= ramoops_pstore_erase,
	},
};

pstore RAM backend是通过persistent ram(ram_core.c)来处理，这个persistent ram来源于Android, mark to check later.

commit cddb8751c80348df75149f44fc3bf38d3dd1f3e6
Author: Anton Vorontsov <anton.vorontsov@linaro.org>
Date:   Thu May 17 00:15:08 2012 -0700

    staging: android: persistent_ram: Move to fs/pstore/ram_core.c
    
    This is a first step for adding ECC support for pstore RAM backend: we
    will use the persistent_ram routines, kindly provided by Google.
    
    Basically, persistent_ram is a set of helper routines to deal with the
    [optionally] ECC-protected persistent ram regions.
    
    A bit of Makefile, Kconfig and header files adjustments were needed
    because of the move.
    
    Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>
    Acked-by: Kees Cook <keescook@chromium.org>
    Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>