ZRAM基本

  • kernel3.18下代码路径在drivers/block/zram下,主要是三部分:zram块设备驱动zram_drv.c,zram数据流操作zcomp.c以及压缩后端lzo/lz4接口
  • zram sysfs node path: /sys/block/zram0/
  • zram kernel doc: Documentation/blockdev/zram.txt

数据流创建

在zram驱动中设置sysfs disksize时,会根据sysfs max_comp_streams的配置数创建一个或多个数据流,代码看下:

struct zcomp *zcomp_create(const char *compress, int max_strm)
{
struct zcomp *comp;
struct zcomp_backend *backend;
int error;

backend = find_backend(compress);
if (!backend)
return ERR_PTR(-EINVAL);

comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL);
if (!comp)
return ERR_PTR(-ENOMEM);

comp->backend = backend;
if (max_strm > 1)
error = zcomp_strm_multi_create(comp, max_strm);
else
error = zcomp_strm_single_create(comp);
if (error) {

找到backend(lzo/lz4)后用zcomp_strm_multi_create创建多个 or 用zcomp_strm_single_create创建单个压缩流。

先来看简单的单压缩流操作。

单数据流操作

/*
 * Per-stream compression state: one zcomp_strm is handed out for each
 * in-flight compress/decompress operation.
 */
struct zcomp_strm {
/* compression/decompression buffer */
void *buffer;
/*
 * The private data of the compression stream, only compression
 * stream backend can touch this (e.g. compression algorithm
 * working memory)
 */
void *private;
/* used in multi stream backend, protected by backend strm_lock */
struct list_head list;
};
/*
 * single zcomp_strm backend
 */
struct zcomp_strm_single {
/* serializes access to the one and only stream */
struct mutex strm_lock;
/* the single pre-allocated stream */
struct zcomp_strm *zstrm;
};

用一个互斥锁strm_lock保证对这一个数据流访问的串行化;buffer是压缩/解压缩时使用的工作缓冲区,private是压缩backend用到的工作内存,压缩backend就是lzo/lz4了,看下压缩backend结构定义:

/* static compression backend */
struct zcomp_backend {
/* compress src into dst; *dst_len returns the compressed length */
int (*compress)(const unsigned char *src, unsigned char *dst,
size_t *dst_len, void *private);

/* decompress src_len bytes from src into dst */
int (*decompress)(const unsigned char *src, size_t src_len,
unsigned char *dst);

/* allocate / free the per-stream private working memory */
void *(*create)(void);
void (*destroy)(void *private);

/* algorithm name, e.g. "lzo" or "lz4" */
const char *name;
};

这里的create就是给private用的了,也就是申请backend所需要的working memory。

/*
 * Allocate LZO working memory for a stream's private data (excerpt).
 * Tries kzalloc first and falls back to __vmalloc; both use GFP_NOIO
 * so the allocation cannot recurse into block I/O while zram itself
 * is servicing I/O under memory pressure.
 */
static void *lzo_create(void)
{
...
ret = kzalloc(LZO1X_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY |
__GFP_NOWARN);
if (!ret)
/* kzalloc failed: retry with virtually-contiguous memory */
ret = __vmalloc(LZO1X_MEM_COMPRESS,
GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN |
__GFP_ZERO | __GFP_HIGHMEM,
PAGE_KERNEL);
return ret;
}

单压缩流创建时会先分配好这个zstrm并初始化互斥锁strm_lock,当zram block drv在rw时先通过zcomp_strm_single_find拿到zstrm(如果有其他进程占着就等着释放),用完后再用zcomp_strm_single_release解锁这个strm_lock。

/*
 * Set up the single-stream backend: one pre-allocated zcomp_strm
 * guarded by a mutex. Returns 0 on success, -ENOMEM on failure.
 */
static int zcomp_strm_single_create(struct zcomp *comp)
{
struct zcomp_strm_single *zs;

/* wire up the single-stream operation set */
comp->destroy = zcomp_strm_single_destroy;
comp->strm_find = zcomp_strm_single_find;
comp->strm_release = zcomp_strm_single_release;
comp->set_max_streams = zcomp_strm_single_set_max_streams;
zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL);
if (!zs)
return -ENOMEM;

comp->stream = zs;
mutex_init(&zs->strm_lock); /* mutex initialization */
zs->zstrm = zcomp_strm_alloc(comp); /* pre-allocate the single stream */
if (!zs->zstrm) {
kfree(zs);
return -ENOMEM;
}
return 0;
}

/*
 * Acquire the single stream; sleeps on the mutex until the current
 * holder releases it.
 */
static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp)
{
struct zcomp_strm_single *zs = comp->stream;
mutex_lock(&zs->strm_lock);
return zs->zstrm;
}

/* Release the single stream by unlocking the mutex taken in find. */
static void zcomp_strm_single_release(struct zcomp *comp,
struct zcomp_strm *zstrm)
{
struct zcomp_strm_single *zs = comp->stream;
mutex_unlock(&zs->strm_lock);
}

多数据流操作

引入多压缩流的目的, 内核文档有说明:

Compression backend may use up to max_comp_streams compression
streams, thus allowing up to max_comp_streams concurrent compression
operations. By default, compression backend uses single compression
stream.

Note:
In order to enable compression backend’s multi stream support max_comp_streams
must be initially set to desired concurrency level before ZRAM device
initialisation. Once the device initialised as a single stream compression
backend (max_comp_streams equals to 1), you will see error if you try to change
the value of max_comp_streams because single stream compression backend
implemented as a special case by lock overhead issue and does not support
dynamic max_comp_streams. Only multi stream backend supports dynamic
max_comp_streams adjustment.

这里提到两个点:

  1. max_comp_streams是用来增强zram的并发性concurrent,而不是并行性parallel,某国内手机厂家内核说明在此问题上提到了并行,我想是他们理解有错。
  2. 当zram使用单压缩流后不能动态修改为多压缩流,只有多压缩流支持运行时修改。

来看下上面的第2个问题:

/*
 * The single-stream backend cannot change its stream count at
 * runtime; always reject the request.
 */
static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm)
{
/* zcomp_strm_single support only max_comp_streams == 1 */
return false;
}

/*
 * sysfs store handler for max_comp_streams (excerpt). Rejects the
 * change when the device was initialised with the single-stream
 * backend; otherwise records the new limit.
 */
static ssize_t max_comp_streams_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
...
down_write(&zram->init_lock);
if (init_done(zram)) { /* disksize has already been configured */
if (!zcomp_set_max_streams(zram->comp, num)) { /* single-stream mode: cannot change */
pr_info("Cannot change max compression streams\n");
ret = -EINVAL;
goto out;
}
}

zram->max_comp_streams = num;
ret = len;
out:
up_write(&zram->init_lock);
return ret;
}

/* change max_strm limit */
static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm)
{
struct zcomp_strm_multi *zs = comp->stream;
struct zcomp_strm *zstrm;

spin_lock(&zs->strm_lock);
zs->max_strm = num_strm;
/*
 * if user has lowered the limit and there are idle streams,
 * immediately free as much streams (and memory) as we can.
 * Streams currently in use are trimmed later, in strm_release.
 */
while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) {
zstrm = list_entry(zs->idle_strm.next,
struct zcomp_strm, list);
list_del(&zstrm->list);
zcomp_strm_free(comp, zstrm);
zs->avail_strm--;
}
spin_unlock(&zs->strm_lock);
return true;
}

多压缩流在运行中修改时,如果当前可用的avail_strm大于这个新配置,那就把多出来的idle_strm释放掉。

下面看下多压缩流的创建,查找申请,释放。

多压缩流创建

多压缩流的逻辑就是创建一个list,在使用时从中选择一个idle strm, 如果list为空那就是都被人占着了,要等待释放。

/*
 * multi zcomp_strm backend
 */
struct zcomp_strm_multi {
/* protect strm list */
spinlock_t strm_lock;
/* max possible number of zstrm streams */
int max_strm;
/* number of available zstrm streams */
int avail_strm;
/* list of available strms */
struct list_head idle_strm;
/* waiters blocked until an idle stream shows up */
wait_queue_head_t strm_wait;
};

/*
 * Set up the multi-stream backend: a spinlock-protected idle list
 * seeded with one pre-allocated stream; further streams are allocated
 * on demand up to max_strm. Returns 0 on success, -ENOMEM on failure.
 */
static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm)
{
struct zcomp_strm *zstrm;
struct zcomp_strm_multi *zs;

/* wire up the multi-stream operation set */
comp->destroy = zcomp_strm_multi_destroy;
comp->strm_find = zcomp_strm_multi_find;
comp->strm_release = zcomp_strm_multi_release;
comp->set_max_streams = zcomp_strm_multi_set_max_streams;
zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL);
if (!zs)
return -ENOMEM;

comp->stream = zs;
spin_lock_init(&zs->strm_lock);
INIT_LIST_HEAD(&zs->idle_strm);
init_waitqueue_head(&zs->strm_wait);
zs->max_strm = max_strm;
zs->avail_strm = 1; /* one stream is allocated up front */

zstrm = zcomp_strm_alloc(comp);
if (!zstrm) {
kfree(zs);
return -ENOMEM;
}
list_add(&zstrm->list, &zs->idle_strm);
return 0;
}

由上,多压缩流结构中定义了一个链表idle_strm,通过zcomp_strm结构中的list成员把各个压缩流链入该链表来管理,具体就是zcomp_strm_alloc申请到一个zstrm后,用list_add把它加入idle_strm链表中。

avail_strm用来计数可用的idle stream。

strm_wait用来处理等待的情况。

多压缩流中查找可用压缩流

/*
 * Get a stream for use: reuse an idle one, or allocate a new one
 * while under the max_strm limit; otherwise sleep until a stream is
 * released back to the idle list.
 */
static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp)
{
struct zcomp_strm_multi *zs = comp->stream;
struct zcomp_strm *zstrm;

while (1) {
spin_lock(&zs->strm_lock);
if (!list_empty(&zs->idle_strm)) {
/* take the first idle stream off the list */
zstrm = list_entry(zs->idle_strm.next,
struct zcomp_strm, list);
list_del(&zstrm->list);
spin_unlock(&zs->strm_lock);
return zstrm;
}
/* zstrm streams limit reached, wait for idle stream */
if (zs->avail_strm >= zs->max_strm) {
spin_unlock(&zs->strm_lock);
wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
continue;
}
/* allocate new zstrm stream */
zs->avail_strm++;
spin_unlock(&zs->strm_lock);

zstrm = zcomp_strm_alloc(comp);
if (!zstrm) {
/* allocation failed: undo the count and wait for a release */
spin_lock(&zs->strm_lock);
zs->avail_strm--;
spin_unlock(&zs->strm_lock);
wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
continue;
}
break;
}
return zstrm;
}

先看可用列表中有没有可用的,如果有就找到了。如果可用列表空了,那就再分配资源,当然前提是资源数不能超过最大压缩数max_strm,如果超过了就等待其他进程释放资源。

ps: 这里可以看到在zcomp_strm_alloc后并没有直接list_add加入可用列表,因为新申请的压缩流会直接返回给调用者使用,此时它并不空闲;等调用者用完,在release时才会把它加入idle_strm可用列表。

多压缩流释放

zram block driver在find中找到某个压缩流,使用完后会释放这块资源。具体是:

/* add stream back to idle list and wake up waiter or free the stream */
static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm)
{
struct zcomp_strm_multi *zs = comp->stream;

spin_lock(&zs->strm_lock);
if (zs->avail_strm <= zs->max_strm) {
/* still within the limit: recycle the stream and wake one waiter */
list_add(&zstrm->list, &zs->idle_strm);
spin_unlock(&zs->strm_lock);
wake_up(&zs->strm_wait);
return;
}

/* limit was lowered while this stream was in use: drop it instead */
zs->avail_strm--;
spin_unlock(&zs->strm_lock);
zcomp_strm_free(comp, zstrm);
}

释放时,若可用流数avail_strm仍在max_strm限制内,就用list_add把压缩流放回可用列表并唤醒等待该压缩流的进程;否则(运行中调低了上限)直接释放该压缩流资源。

ZRAM callstack

 [<c010cd84>] (unwind_backtrace) from [<c01099f4>] (show_stack+0x10/0x14)
[ 65.015711] [<c01099f4>] (show_stack) from [<c0baeaa4>] (dump_stack+0x78/0x98)
[ 65.015721] [<c0baeaa4>] (dump_stack) from [<c052acac>] (zram_make_request+0x12c/0x474)
[ 65.015731] [<c052acac>] (zram_make_request) from [<c035278c>] (generic_make_request+0x90/0xc8)
[ 65.015741] [<c035278c>] (generic_make_request) from [<c03528d8>] (submit_bio+0x114/0x15c)
[ 65.015752] [<c03528d8>] (submit_bio) from [<c0206ce4>] (__swap_writepage+0x260/0x27c)
[ 65.015762] [<c0206ce4>] (__swap_writepage) from [<c01e203c>] (shrink_page_list+0x57c/0x9b0)
[ 65.015771] [<c01e203c>] (shrink_page_list) from [<c01e2658>] (reclaim_pages_from_list+0xa8/0x108)
[ 65.015781] [<c01e2658>] (reclaim_pages_from_list) from [<c0263968>] (reclaim_pte_range+0x11c/0x184)
[ 65.015791] [<c0263968>] (reclaim_pte_range) from [<c0203ce8>] (walk_page_range+0x1b8/0x248)
[ 65.015801] [<c0203ce8>] (walk_page_range) from [<c026462c>] (reclaim_task_anon+0xb4/0x11c)
[ 65.015810] [<c026462c>] (reclaim_task_anon) from [<c0214524>] (swap_fn+0x248/0x4ec)
[ 65.015821] [<c0214524>] (swap_fn) from [<c0137434>] (process_one_work+0x254/0x464)
[ 65.015830] [<c0137434>] (process_one_work) from [<c013816c>] (worker_thread+0x2b4/0x3f8)
[ 65.015839] [<c013816c>] (worker_thread) from [<c013b9b4>] (kthread+0xdc/0xf0)
[ 65.015849] [<c013b9b4>] (kthread) from [<c0105f80>] (ret_from_fork+0x14/0x34)
] CPU: 1 PID: 2737 Comm: ngs.android.pop Tainted: G        W      3.18.31-perf-g5e6acc4-00677-gb0cabe3-dirty #9
[ 65.014502] [<c010cd84>] (unwind_backtrace) from [<c01099f4>] (show_stack+0x10/0x14)
[ 65.014514] [<c01099f4>] (show_stack) from [<c0baeaa4>] (dump_stack+0x78/0x98)
[ 65.014527] [<c0baeaa4>] (dump_stack) from [<c052acac>] (zram_make_request+0x12c/0x474)
[ 65.014540] [<c052acac>] (zram_make_request) from [<c035278c>] (generic_make_request+0x90/0xc8)
[ 65.014550] [<c035278c>] (generic_make_request) from [<c03528d8>] (submit_bio+0x114/0x15c)
[ 65.014561] [<c03528d8>] (submit_bio) from [<c0206e18>] (swap_readpage+0xdc/0xf0)
[ 65.014571] [<c0206e18>] (swap_readpage) from [<c0207430>] (read_swap_cache_async+0x154/0x1cc)
[ 65.014582] [<c0207430>] (read_swap_cache_async) from [<c0207618>] (swapin_readahead+0x170/0x184)
[ 65.014592] [<c0207618>] (swapin_readahead) from [<c01f916c>] (handle_mm_fault+0x490/0x904)
[ 65.014603] [<c01f916c>] (handle_mm_fault) from [<c0114590>] (do_page_fault+0x118/0x378)
[ 65.014612] [<c0114590>] (do_page_fault) from [<c010030c>] (do_DataAbort+0x34/0x164)
[ 65.014620] [<c010030c>] (do_DataAbort) from [<c010a6fc>] (__dabt_usr+0x3c/0x40)
[ 65.014625] Exception stack(0xe4895fb0 to 0xe4895ff8)