ZRAM基本

  • kernel3.18下代码路径在drivers/block/zram下,主要是三部分:zram块设备驱动zram_drv.c,zram数据流操作zcomp.c以及压缩后端lzo/lz4接口
  • zram sysfs node path: /sys/block/zram0/
  • zram kernel doc: Documentation/blockdev/zram.txt

数据流创建

在zram驱动中设置sysfs disksize时,会根据sysfs max_comp_streams的配置数创建一个或多个数据流,代码看下:

struct zcomp *zcomp_create(const char *compress, int max_strm)
{
struct zcomp *comp;
struct zcomp_backend *backend;
int error;

backend = find_backend(compress);
if (!backend)
return ERR_PTR(-EINVAL);

comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL);
if (!comp)
return ERR_PTR(-ENOMEM);

comp->backend = backend;
if (max_strm > 1)
error = zcomp_strm_multi_create(comp, max_strm);
else
error = zcomp_strm_single_create(comp);
if (error) {

找到backend(lzo/lz4)后用zcomp_strm_multi_create创建多个 or 用zcomp_strm_single_create创建单个压缩流。

先来看简单的单压缩流操作。

单数据流操作

/*
 * Per-stream compression state: one zcomp_strm is handed out for each
 * in-flight compress/decompress operation.
 */
struct zcomp_strm {
/* compression/decompression buffer */
void *buffer;
/*
 * The private data of the compression stream, only compression
 * stream backend can touch this (e.g. compression algorithm
 * working memory)
 */
void *private;
/* used in multi stream backend, protected by backend strm_lock */
struct list_head list;
};
/*
 * single zcomp_strm backend
 */
struct zcomp_strm_single {
/* serializes access to the one and only stream */
struct mutex strm_lock;
/* the single pre-allocated stream */
struct zcomp_strm *zstrm;
};

用一个互斥锁strm_lock保证对这一个数据流访问的串行化;buffer是压缩/解压缩时使用的工作缓冲区,private是压缩backend用到的工作内存,压缩backend就是lzo/lz4了,看下压缩backend结构定义:

/* static compression backend */
struct zcomp_backend {
/* compress src into dst; *dst_len returns the compressed length */
int (*compress)(const unsigned char *src, unsigned char *dst,
size_t *dst_len, void *private);

/* decompress src_len bytes from src into dst */
int (*decompress)(const unsigned char *src, size_t src_len,
unsigned char *dst);

/* allocate / free the per-stream private working memory */
void *(*create)(void);
void (*destroy)(void *private);

/* algorithm name, e.g. "lzo" or "lz4" */
const char *name;
};

这里的create就是给private用的了,也就是申请backend所需要的working memory。

/*
 * Allocate LZO working memory for a stream's private data (excerpt).
 * Tries kzalloc first and falls back to __vmalloc; both use GFP_NOIO
 * so the allocation cannot recurse into block I/O while zram itself
 * is servicing I/O under memory pressure.
 */
static void *lzo_create(void)
{
...
ret = kzalloc(LZO1X_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY |
__GFP_NOWARN);
if (!ret)
/* kzalloc failed: retry with virtually-contiguous memory */
ret = __vmalloc(LZO1X_MEM_COMPRESS,
GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN |
__GFP_ZERO | __GFP_HIGHMEM,
PAGE_KERNEL);
return ret;
}

单压缩流创建时会先分配好这个zstrm并初始化互斥锁strm_lock,当zram block drv在rw时先通过zcomp_strm_single_find拿到zstrm(如果有其他进程占着就等着释放),用完后再用zcomp_strm_single_release解锁这个strm_lock。

/*
 * Set up the single-stream backend: one pre-allocated zcomp_strm
 * guarded by a mutex. Returns 0 on success, -ENOMEM on failure.
 */
static int zcomp_strm_single_create(struct zcomp *comp)
{
struct zcomp_strm_single *zs;

/* wire up the single-stream operation set */
comp->destroy = zcomp_strm_single_destroy;
comp->strm_find = zcomp_strm_single_find;
comp->strm_release = zcomp_strm_single_release;
comp->set_max_streams = zcomp_strm_single_set_max_streams;
zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL);
if (!zs)
return -ENOMEM;

comp->stream = zs;
mutex_init(&zs->strm_lock); /* mutex initialization */
zs->zstrm = zcomp_strm_alloc(comp); /* pre-allocate the single stream */
if (!zs->zstrm) {
kfree(zs);
return -ENOMEM;
}
return 0;
}

/*
 * Acquire the single stream; sleeps on the mutex until the current
 * holder releases it.
 */
static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp)
{
struct zcomp_strm_single *zs = comp->stream;
mutex_lock(&zs->strm_lock);
return zs->zstrm;
}

/* Release the single stream by unlocking the mutex taken in find. */
static void zcomp_strm_single_release(struct zcomp *comp,
struct zcomp_strm *zstrm)
{
struct zcomp_strm_single *zs = comp->stream;
mutex_unlock(&zs->strm_lock);
}

多数据流操作

引入多压缩流的目的, 内核文档有说明:

Compression backend may use up to max_comp_streams compression
streams, thus allowing up to max_comp_streams concurrent compression
operations. By default, compression backend uses single compression
stream.

Note:
In order to enable compression backend’s multi stream support max_comp_streams
must be initially set to desired concurrency level before ZRAM device
initialisation. Once the device initialised as a single stream compression
backend (max_comp_streams equals to 1), you will see error if you try to change
the value of max_comp_streams because single stream compression backend
implemented as a special case by lock overhead issue and does not support
dynamic max_comp_streams. Only multi stream backend supports dynamic
max_comp_streams adjustment.

这里提到两个点:

  1. max_comp_streams是用来增强zram的并发性concurrent,而不是并行性parallel,某国内手机厂家内核说明在此问题上提到了并行,我想是他们理解有错。
  2. 当zram使用单压缩流后不能动态修改为多压缩流,只有多压缩流支持运行时修改。

来看下上面的第2个问题:

/*
 * The single-stream backend cannot change its stream count at
 * runtime; always reject the request.
 */
static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm)
{
/* zcomp_strm_single support only max_comp_streams == 1 */
return false;
}

/*
 * sysfs store handler for max_comp_streams (excerpt). Rejects the
 * change when the device was initialised with the single-stream
 * backend; otherwise records the new limit.
 */
static ssize_t max_comp_streams_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
...
down_write(&zram->init_lock);
if (init_done(zram)) { /* disksize has already been configured */
if (!zcomp_set_max_streams(zram->comp, num)) { /* single-stream mode: cannot change */
pr_info("Cannot change max compression streams\n");
ret = -EINVAL;
goto out;
}
}

zram->max_comp_streams = num;
ret = len;
out:
up_write(&zram->init_lock);
return ret;
}

/* change max_strm limit */
static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm)
{
struct zcomp_strm_multi *zs = comp->stream;
struct zcomp_strm *zstrm;

spin_lock(&zs->strm_lock);
zs->max_strm = num_strm;
/*
 * if user has lowered the limit and there are idle streams,
 * immediately free as much streams (and memory) as we can.
 * Streams currently in use are trimmed later, in strm_release.
 */
while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) {
zstrm = list_entry(zs->idle_strm.next,
struct zcomp_strm, list);
list_del(&zstrm->list);
zcomp_strm_free(comp, zstrm);
zs->avail_strm--;
}
spin_unlock(&zs->strm_lock);
return true;
}

多压缩流在运行中修改时,如果当前可用的avail_strm大于这个新配置,那就把多出来的idle_strm释放掉。

下面看下多压缩流的创建,查找申请,释放。

多压缩流创建

多压缩流的逻辑就是创建一个list,在使用时从中选择一个idle strm, 如果list为空那就是都被人占着了,要等待释放。

/*
 * multi zcomp_strm backend
 */
struct zcomp_strm_multi {
/* protect strm list */
spinlock_t strm_lock;
/* max possible number of zstrm streams */
int max_strm;
/* number of available zstrm streams */
int avail_strm;
/* list of available strms */
struct list_head idle_strm;
/* waiters blocked until an idle stream shows up */
wait_queue_head_t strm_wait;
};

/*
 * Set up the multi-stream backend: a spinlock-protected idle list
 * seeded with one pre-allocated stream; further streams are allocated
 * on demand up to max_strm. Returns 0 on success, -ENOMEM on failure.
 */
static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm)
{
struct zcomp_strm *zstrm;
struct zcomp_strm_multi *zs;

/* wire up the multi-stream operation set */
comp->destroy = zcomp_strm_multi_destroy;
comp->strm_find = zcomp_strm_multi_find;
comp->strm_release = zcomp_strm_multi_release;
comp->set_max_streams = zcomp_strm_multi_set_max_streams;
zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL);
if (!zs)
return -ENOMEM;

comp->stream = zs;
spin_lock_init(&zs->strm_lock);
INIT_LIST_HEAD(&zs->idle_strm);
init_waitqueue_head(&zs->strm_wait);
zs->max_strm = max_strm;
zs->avail_strm = 1; /* one stream is allocated up front */

zstrm = zcomp_strm_alloc(comp);
if (!zstrm) {
kfree(zs);
return -ENOMEM;
}
list_add(&zstrm->list, &zs->idle_strm);
return 0;
}

由上,多压缩流结构中定义了一个链表idle_strm,通过zcomp_strm结构中的list成员把各个压缩流链入该链表来管理,具体就是zcomp_strm_alloc申请到一个zstrm后,用list_add把它加入idle_strm链表中。

avail_strm用来计数可用的idle stream。

strm_wait用来处理等待的情况。

多压缩流中查找可用压缩流

/*
 * Get a stream for use: reuse an idle one, or allocate a new one
 * while under the max_strm limit; otherwise sleep until a stream is
 * released back to the idle list.
 */
static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp)
{
struct zcomp_strm_multi *zs = comp->stream;
struct zcomp_strm *zstrm;

while (1) {
spin_lock(&zs->strm_lock);
if (!list_empty(&zs->idle_strm)) {
/* take the first idle stream off the list */
zstrm = list_entry(zs->idle_strm.next,
struct zcomp_strm, list);
list_del(&zstrm->list);
spin_unlock(&zs->strm_lock);
return zstrm;
}
/* zstrm streams limit reached, wait for idle stream */
if (zs->avail_strm >= zs->max_strm) {
spin_unlock(&zs->strm_lock);
wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
continue;
}
/* allocate new zstrm stream */
zs->avail_strm++;
spin_unlock(&zs->strm_lock);

zstrm = zcomp_strm_alloc(comp);
if (!zstrm) {
/* allocation failed: undo the count and wait for a release */
spin_lock(&zs->strm_lock);
zs->avail_strm--;
spin_unlock(&zs->strm_lock);
wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
continue;
}
break;
}
return zstrm;
}

先看可用列表中有没有可用的,如果有就找到了。如果可用列表空了,那就再分配资源,当然前提是资源数不能超过最大压缩数max_strm,如果超过了就等待其他进程释放资源。

ps: 这里可以看到在zcomp_strm_alloc后并没有直接list_add加入可用列表,因为新申请的压缩流会直接返回给调用者使用,此时它并不空闲;等调用者用完,在release时才会把它加入idle_strm可用列表。

多压缩流释放

zram block driver在find中找到某个压缩流,使用完后会释放这块资源。具体是:

/* add stream back to idle list and wake up waiter or free the stream */
static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm)
{
struct zcomp_strm_multi *zs = comp->stream;

spin_lock(&zs->strm_lock);
if (zs->avail_strm <= zs->max_strm) {
/* still within the limit: recycle the stream and wake one waiter */
list_add(&zstrm->list, &zs->idle_strm);
spin_unlock(&zs->strm_lock);
wake_up(&zs->strm_wait);
return;
}

/* limit was lowered while this stream was in use: drop it instead */
zs->avail_strm--;
spin_unlock(&zs->strm_lock);
zcomp_strm_free(comp, zstrm);
}

释放时,若可用流数avail_strm仍在max_strm限制内,就用list_add把压缩流放回可用列表并唤醒等待该压缩流的进程;否则(运行中调低了上限)直接释放该压缩流资源。

ZRAM callstack

 [<c010cd84>] (unwind_backtrace) from [<c01099f4>] (show_stack+0x10/0x14)
[ 65.015711] [<c01099f4>] (show_stack) from [<c0baeaa4>] (dump_stack+0x78/0x98)
[ 65.015721] [<c0baeaa4>] (dump_stack) from [<c052acac>] (zram_make_request+0x12c/0x474)
[ 65.015731] [<c052acac>] (zram_make_request) from [<c035278c>] (generic_make_request+0x90/0xc8)
[ 65.015741] [<c035278c>] (generic_make_request) from [<c03528d8>] (submit_bio+0x114/0x15c)
[ 65.015752] [<c03528d8>] (submit_bio) from [<c0206ce4>] (__swap_writepage+0x260/0x27c)
[ 65.015762] [<c0206ce4>] (__swap_writepage) from [<c01e203c>] (shrink_page_list+0x57c/0x9b0)
[ 65.015771] [<c01e203c>] (shrink_page_list) from [<c01e2658>] (reclaim_pages_from_list+0xa8/0x108)
[ 65.015781] [<c01e2658>] (reclaim_pages_from_list) from [<c0263968>] (reclaim_pte_range+0x11c/0x184)
[ 65.015791] [<c0263968>] (reclaim_pte_range) from [<c0203ce8>] (walk_page_range+0x1b8/0x248)
[ 65.015801] [<c0203ce8>] (walk_page_range) from [<c026462c>] (reclaim_task_anon+0xb4/0x11c)
[ 65.015810] [<c026462c>] (reclaim_task_anon) from [<c0214524>] (swap_fn+0x248/0x4ec)
[ 65.015821] [<c0214524>] (swap_fn) from [<c0137434>] (process_one_work+0x254/0x464)
[ 65.015830] [<c0137434>] (process_one_work) from [<c013816c>] (worker_thread+0x2b4/0x3f8)
[ 65.015839] [<c013816c>] (worker_thread) from [<c013b9b4>] (kthread+0xdc/0xf0)
[ 65.015849] [<c013b9b4>] (kthread) from [<c0105f80>] (ret_from_fork+0x14/0x34)
] CPU: 1 PID: 2737 Comm: ngs.android.pop Tainted: G        W      3.18.31-perf-g5e6acc4-00677-gb0cabe3-dirty #9
[ 65.014502] [<c010cd84>] (unwind_backtrace) from [<c01099f4>] (show_stack+0x10/0x14)
[ 65.014514] [<c01099f4>] (show_stack) from [<c0baeaa4>] (dump_stack+0x78/0x98)
[ 65.014527] [<c0baeaa4>] (dump_stack) from [<c052acac>] (zram_make_request+0x12c/0x474)
[ 65.014540] [<c052acac>] (zram_make_request) from [<c035278c>] (generic_make_request+0x90/0xc8)
[ 65.014550] [<c035278c>] (generic_make_request) from [<c03528d8>] (submit_bio+0x114/0x15c)
[ 65.014561] [<c03528d8>] (submit_bio) from [<c0206e18>] (swap_readpage+0xdc/0xf0)
[ 65.014571] [<c0206e18>] (swap_readpage) from [<c0207430>] (read_swap_cache_async+0x154/0x1cc)
[ 65.014582] [<c0207430>] (read_swap_cache_async) from [<c0207618>] (swapin_readahead+0x170/0x184)
[ 65.014592] [<c0207618>] (swapin_readahead) from [<c01f916c>] (handle_mm_fault+0x490/0x904)
[ 65.014603] [<c01f916c>] (handle_mm_fault) from [<c0114590>] (do_page_fault+0x118/0x378)
[ 65.014612] [<c0114590>] (do_page_fault) from [<c010030c>] (do_DataAbort+0x34/0x164)
[ 65.014620] [<c010030c>] (do_DataAbort) from [<c010a6fc>] (__dabt_usr+0x3c/0x40)
[ 65.014625] Exception stack(0xe4895fb0 to 0xe4895ff8)