ZRAM基本

  • kernel 3.18 下代码路径在 drivers/block/zram 下,主要是三部分:zram块设备驱动zram_drv.c,zram数据流操作zcomp.c以及压缩后端(backend) lzo/lz4接口
  • zram sysfs node path: /sys/block/zram0/
  • zram kernel doc: Documentation/blockdev/zram.txt

数据流创建

在zram驱动中设置sysfs disksize时,会根据sysfs max_comp_streams的配置数创建一个或多个数据流,代码看下:

struct zcomp *zcomp_create(const char *compress, int max_strm)
{
    struct zcomp *comp;
    struct zcomp_backend *backend;
    int error;

    backend = find_backend(compress);
    if (!backend)
        return ERR_PTR(-EINVAL);

    comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL);
    if (!comp)
        return ERR_PTR(-ENOMEM);

    comp->backend = backend;
    if (max_strm > 1)
        error = zcomp_strm_multi_create(comp, max_strm);
    else
        error = zcomp_strm_single_create(comp);
    if (error) {

找到backend(lzo/lz4)后用zcomp_strm_multi_create创建多个 or 用zcomp_strm_single_create创建单个压缩流。

先来看简单的单压缩流数据。

单数据流操作

struct zcomp_strm {
    /* compression/decompression buffer */
    void *buffer;
    /*
     * The private data of the compression stream, only compression
     * stream backend can touch this (e.g. compression algorithm
     * working memory)
     */
    void *private;
    /* used in multi stream backend, protected by backend strm_lock */
    struct list_head list;
};
/*
 * single zcomp_strm backend
 */
struct zcomp_strm_single {
    struct mutex strm_lock;
    struct zcomp_strm *zstrm;
};

用一个互斥锁strm_lock保护这一个数据流访问的串行化,buffer用于存放压缩后的数据,private是压缩backend用到的,压缩backend就是lzo/lz4了,看下压缩backend结构定义:

/* static compression backend */
struct zcomp_backend {
    int (*compress)(const unsigned char *src, unsigned char *dst,
            size_t *dst_len, void *private);

    int (*decompress)(const unsigned char *src, size_t src_len,
            unsigned char *dst);

    void *(*create)(void);
    void (*destroy)(void *private);

    const char *name;
};

这里的create就是给private用的了,也就是申请backend所需要的working memory。

static void *lzo_create(void)
{
    ...
    ret = kzalloc(LZO1X_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY |
                    __GFP_NOWARN);
    if (!ret)
        ret = __vmalloc(LZO1X_MEM_COMPRESS,
                GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN |
                __GFP_ZERO | __GFP_HIGHMEM,
                PAGE_KERNEL);
    return ret;
}

单压缩流创建时会先分配好这个zstrm并初始化互斥锁strm_lock,当zram block drv在rw时先通过zcomp_strm_single_find拿到zstrm(如果有其他进程占着就等着释放),用完后再用zcomp_strm_single_release解锁这个strm_lock。

static int zcomp_strm_single_create(struct zcomp *comp)
{
    struct zcomp_strm_single *zs;

    comp->destroy = zcomp_strm_single_destroy;
    comp->strm_find = zcomp_strm_single_find;
    comp->strm_release = zcomp_strm_single_release;
    comp->set_max_streams = zcomp_strm_single_set_max_streams;
    zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL);
    if (!zs)
        return -ENOMEM;

    comp->stream = zs;
    mutex_init(&zs->strm_lock);         //互斥锁初始化
    zs->zstrm = zcomp_strm_alloc(comp); //先分配好这个压缩流
    if (!zs->zstrm) {
        kfree(zs);
        return -ENOMEM;
    }
    return 0;
}

static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp)
{
    struct zcomp_strm_single *zs = comp->stream;
    mutex_lock(&zs->strm_lock);
    return zs->zstrm;
}

static void zcomp_strm_single_release(struct zcomp *comp,
        struct zcomp_strm *zstrm)
{
    struct zcomp_strm_single *zs = comp->stream;
    mutex_unlock(&zs->strm_lock);
}

多数据流操作

引入多压缩流的目的, 内核文档有说明:

Compression backend may use up to max_comp_streams compression
streams, thus allowing up to max_comp_streams concurrent compression
operations. By default, compression backend uses single compression
stream.

Note:
In order to enable compression backend's multi stream support max_comp_streams
must be initially set to desired concurrency level before ZRAM device
initialisation. Once the device initialised as a single stream compression
backend (max_comp_streams equals to 1), you will see error if you try to change
the value of max_comp_streams because single stream compression backend
implemented as a special case by lock overhead issue and does not support
dynamic max_comp_streams. Only multi stream backend supports dynamic
max_comp_streams adjustment.

这里提到两个点:

  1. max_comp_streams是用来增强zram的并发性concurrent,而不是并行性parallel,某国内手机厂家内核说明在此问题上提到了并行,我想是他们理解有错。
  2. 当zram使用单压缩流后不能动态修改为多压缩流,只有多压缩流支持运行时修改。

来看下上面的第2个问题:

static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm)
{
    /* zcomp_strm_single support only max_comp_streams == 1 */
    return false;
}

static ssize_t max_comp_streams_store(struct device *dev,
        struct device_attribute *attr, const char *buf, size_t len)
{
        ...
    down_write(&zram->init_lock);
    if (init_done(zram)) {         // disksize已配置过
        if (!zcomp_set_max_streams(zram->comp, num)) { // 如果当前是单压缩流模式,不能修改
            pr_info("Cannot change max compression streams\n");
            ret = -EINVAL;
            goto out;
        }
    }

    zram->max_comp_streams = num;
    ret = len;
out:
    up_write(&zram->init_lock);
    return ret;
}

/* change max_strm limit */
static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm)
{
    struct zcomp_strm_multi *zs = comp->stream;
    struct zcomp_strm *zstrm;

    spin_lock(&zs->strm_lock);
    zs->max_strm = num_strm;
    /*
     * if user has lowered the limit and there are idle streams,
     * immediately free as much streams (and memory) as we can.
     */
    while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) {
        zstrm = list_entry(zs->idle_strm.next,
                struct zcomp_strm, list);
        list_del(&zstrm->list);
        zcomp_strm_free(comp, zstrm);
        zs->avail_strm--;
    }
    spin_unlock(&zs->strm_lock);
    return true;
}

多压缩流在运行中修改时,如果当前可用的avail_strm大于这个新配置,那就把多出来的idle_strm释放掉。

下面看下多压缩流的创建,查找申请,释放。

多压缩流创建

多压缩流的逻辑就是创建一个list,在使用时从中选择一个idle strm, 如果list为空那就是都被人占着了,要等待释放。

/*
 * multi zcomp_strm backend
 */
struct zcomp_strm_multi {
    /* protect strm list */
    spinlock_t strm_lock;
    /* max possible number of zstrm streams */
    int max_strm;
    /* number of available zstrm streams */
    int avail_strm;
    /* list of available strms */
    struct list_head idle_strm;
    wait_queue_head_t strm_wait;
};

static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm)
{
    struct zcomp_strm *zstrm;
    struct zcomp_strm_multi *zs;

    comp->destroy = zcomp_strm_multi_destroy;
    comp->strm_find = zcomp_strm_multi_find;
    comp->strm_release = zcomp_strm_multi_release;
    comp->set_max_streams = zcomp_strm_multi_set_max_streams;
    zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL);
    if (!zs)
        return -ENOMEM;

    comp->stream = zs;
    spin_lock_init(&zs->strm_lock);
    INIT_LIST_HEAD(&zs->idle_strm);
    init_waitqueue_head(&zs->strm_wait);
    zs->max_strm = max_strm;
    zs->avail_strm = 1;

    zstrm = zcomp_strm_alloc(comp);
    if (!zstrm) {
        kfree(zs);
        return -ENOMEM;
    }
    list_add(&zstrm->list, &zs->idle_strm);
    return 0;
}

由上,多压缩流结构中定义一个list idle_strm,使用单压缩流结构的backend data链入这个list来管理,具体就是zcomp_strm_alloc申请到了一个zstrm后把它list_add idle_strm这个list中。

avail_strm用来计数可用的idle stream。

strm_wait用来处理等待的情况。

多压缩流中查找可用压缩流

static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp)
{
    struct zcomp_strm_multi *zs = comp->stream;
    struct zcomp_strm *zstrm;

    while (1) {
        spin_lock(&zs->strm_lock);
        if (!list_empty(&zs->idle_strm)) {
            zstrm = list_entry(zs->idle_strm.next,
                    struct zcomp_strm, list);
            list_del(&zstrm->list);
            spin_unlock(&zs->strm_lock);
            return zstrm;
        }
        /* zstrm streams limit reached, wait for idle stream */
        if (zs->avail_strm >= zs->max_strm) {
            spin_unlock(&zs->strm_lock);
            wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
            continue;
        }
        /* allocate new zstrm stream */
        zs->avail_strm++;
        spin_unlock(&zs->strm_lock);

        zstrm = zcomp_strm_alloc(comp);
        if (!zstrm) {
            spin_lock(&zs->strm_lock);
            zs->avail_strm--;
            spin_unlock(&zs->strm_lock);
            wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
            continue;
        }
        break;
    }
    return zstrm;
}

先看可用列表中有没有可用的,如果有就找到了。如果可用列表空了,那就再分配资源,当然前提是资源数不能超过最大压缩数max_strm,如果超过了就等待其他进程释放资源。

ps: 这里可以看到在zcomp_strm_alloc后并没有把新压缩流list_add加入可用列表,而是直接返回给调用者使用——新分配的流本来就是为当前请求服务的,马上就会被占用,先挂入idle列表再摘出来是多余的;它在release时才会(在max_strm限额内)被加入可用列表。

多压缩流释放

zram block driver在find中找到某个压缩流,使用完后会释放这块资源。具体是:

/* add stream back to idle list and wake up waiter or free the stream */
static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm)
{
    struct zcomp_strm_multi *zs = comp->stream;

    spin_lock(&zs->strm_lock);
    if (zs->avail_strm <= zs->max_strm) {
        list_add(&zstrm->list, &zs->idle_strm);
        spin_unlock(&zs->strm_lock);
        wake_up(&zs->strm_wait);
        return;
    }

    zs->avail_strm--;
    spin_unlock(&zs->strm_lock);
    zcomp_strm_free(comp, zstrm);
}

释放时max_strm范围内list_add加入可用列表并唤醒等待该压缩流的进程 or 范围外就释放资源。

ZRAM callstack

 [<c010cd84>] (unwind_backtrace) from [<c01099f4>] (show_stack+0x10/0x14)
[   65.015711] [<c01099f4>] (show_stack) from [<c0baeaa4>] (dump_stack+0x78/0x98)
[   65.015721] [<c0baeaa4>] (dump_stack) from [<c052acac>] (zram_make_request+0x12c/0x474)
[   65.015731] [<c052acac>] (zram_make_request) from [<c035278c>] (generic_make_request+0x90/0xc8)
[   65.015741] [<c035278c>] (generic_make_request) from [<c03528d8>] (submit_bio+0x114/0x15c)
[   65.015752] [<c03528d8>] (submit_bio) from [<c0206ce4>] (__swap_writepage+0x260/0x27c)
[   65.015762] [<c0206ce4>] (__swap_writepage) from [<c01e203c>] (shrink_page_list+0x57c/0x9b0)
[   65.015771] [<c01e203c>] (shrink_page_list) from [<c01e2658>] (reclaim_pages_from_list+0xa8/0x108)
[   65.015781] [<c01e2658>] (reclaim_pages_from_list) from [<c0263968>] (reclaim_pte_range+0x11c/0x184)
[   65.015791] [<c0263968>] (reclaim_pte_range) from [<c0203ce8>] (walk_page_range+0x1b8/0x248)
[   65.015801] [<c0203ce8>] (walk_page_range) from [<c026462c>] (reclaim_task_anon+0xb4/0x11c)
[   65.015810] [<c026462c>] (reclaim_task_anon) from [<c0214524>] (swap_fn+0x248/0x4ec)
[   65.015821] [<c0214524>] (swap_fn) from [<c0137434>] (process_one_work+0x254/0x464)
[   65.015830] [<c0137434>] (process_one_work) from [<c013816c>] (worker_thread+0x2b4/0x3f8)
[   65.015839] [<c013816c>] (worker_thread) from [<c013b9b4>] (kthread+0xdc/0xf0)
[   65.015849] [<c013b9b4>] (kthread) from [<c0105f80>] (ret_from_fork+0x14/0x34)
CPU: 1 PID: 2737 Comm: ngs.android.pop Tainted: G        W      3.18.31-perf-g5e6acc4-00677-gb0cabe3-dirty #9
[   65.014502] [<c010cd84>] (unwind_backtrace) from [<c01099f4>] (show_stack+0x10/0x14)
[   65.014514] [<c01099f4>] (show_stack) from [<c0baeaa4>] (dump_stack+0x78/0x98)
[   65.014527] [<c0baeaa4>] (dump_stack) from [<c052acac>] (zram_make_request+0x12c/0x474)
[   65.014540] [<c052acac>] (zram_make_request) from [<c035278c>] (generic_make_request+0x90/0xc8)
[   65.014550] [<c035278c>] (generic_make_request) from [<c03528d8>] (submit_bio+0x114/0x15c)
[   65.014561] [<c03528d8>] (submit_bio) from [<c0206e18>] (swap_readpage+0xdc/0xf0)
[   65.014571] [<c0206e18>] (swap_readpage) from [<c0207430>] (read_swap_cache_async+0x154/0x1cc)
[   65.014582] [<c0207430>] (read_swap_cache_async) from [<c0207618>] (swapin_readahead+0x170/0x184)
[   65.014592] [<c0207618>] (swapin_readahead) from [<c01f916c>] (handle_mm_fault+0x490/0x904)
[   65.014603] [<c01f916c>] (handle_mm_fault) from [<c0114590>] (do_page_fault+0x118/0x378)
[   65.014612] [<c0114590>] (do_page_fault) from [<c010030c>] (do_DataAbort+0x34/0x164)
[   65.014620] [<c010030c>] (do_DataAbort) from [<c010a6fc>] (__dabt_usr+0x3c/0x40)
[   65.014625] Exception stack(0xe4895fb0 to 0xe4895ff8)