Android Go的用户态lmk(lmkd)在探测内存压力时用到了memcg的如下内存使用统计:

#define MEMCG_MEMORY_USAGE "/dev/memcg/memory.usage_in_bytes"
#define MEMCG_MEMORYSW_USAGE "/dev/memcg/memory.memsw.usage_in_bytes"

memcg是cgroup的一个子系统,那这两个文件是如何统计内存使用的呢?带着这个疑问来看下实现(基于kernel 3.18,msm平台)。

How to read

static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
.read_u64 = mem_cgroup_read_u64,
},
...
#ifdef CONFIG_MEMCG_SWAP
static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
.read_u64 = mem_cgroup_read_u64,
},

统一入口mem_cgroup_read_u64:

static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
enum res_type type = MEMFILE_TYPE(cft->private);
int name = MEMFILE_ATTR(cft->private);

switch (type) {
case _MEM:
if (name == RES_USAGE)
return mem_cgroup_usage(memcg, false);
return res_counter_read_u64(&memcg->res, name);
case _MEMSWAP:
if (name == RES_USAGE)
return mem_cgroup_usage(memcg, true);
return res_counter_read_u64(&memcg->memsw, name);

这里usage_in_bytes和memsw.usage_in_bytes都走了mem_cgroup_usage(memcg, bool swap),区别只是swap参数分别为false和true:

/*
 * mem_cgroup_usage - compute the usage value reported for a memcg.
 * @memcg: the memory cgroup being queried
 * @swap:  false to report memory only; true to report memory plus swap
 *
 * For a non-root group the value is read directly from the group's
 * res_counter (res, or memsw when @swap is true). For the root group
 * the value is rebuilt from statistics: CACHE + RSS (+ SWAP when @swap
 * is true), each summed by mem_cgroup_recursive_stat().
 */
static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
u64 val;

/* Non-root groups: RES_USAGE of the matching res_counter is the answer. */
if (!mem_cgroup_is_root(memcg)) {
if (!swap)
return res_counter_read_u64(&memcg->res, RES_USAGE);
else
return res_counter_read_u64(&memcg->memsw, RES_USAGE);
}

/*
* Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
* as well as in MEM_CGROUP_STAT_RSS_HUGE.
*/
/* Root group: sum the statistics instead of reading a res_counter. */
val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);

/* Only the memsw variant adds the swap statistic. */
if (swap)
val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);

/*
 * NOTE(review): the shift presumably converts a page count into bytes,
 * matching the "usage_in_bytes" file names - confirm.
 */
return val << PAGE_SHIFT;
}

从字面上看,如果是root memcg那就走mem_cgroup_recursive_stat循环统计,如果是non root memcg那就直接call res_counter_read_u64去读res

res(即res_counter)的内核文档设计说明:

2.1. Design

The core of the design is a counter called the res_counter. The res_counter
tracks the current memory usage and limit of the group of processes associated
with the controller. Each cgroup has a memory controller specific data
structure (mem_cgroup) associated with it.

2.2. Accounting

           +--------------------+
           |  mem_cgroup        |
           |  (res_counter)     |   
           +--------------------+
            /            ^      \   
           /             |       \   
      +---------------+  |        +---------------+
      | mm_struct     |  |....    | mm_struct     |   
      |               |  |        |               |   
      +---------------+  |        +---------------+
                         |   
                         + --------------+
                                         |   
      +---------------+           +------+--------+
      | page          +---------->| page_cgroup   |
      |               |           |               |   
      +---------------+           +---------------+

        (Figure 1: Hierarchy of Accounting)

Figure 1 shows the important aspects of the controller

  1. Accounting happens per cgroup
  2. Each mm_struct knows about which cgroup it belongs to
  3. Each page has a pointer to the page_cgroup, which in turn knows the
    cgroup it belongs to
struct mem_cgroup {
struct cgroup_subsys_state css;
/*
* the counter to account for memory usage
*/
struct res_counter res;

root memcg的统计用的是struct mem_cgroup_stat_cpu的count数组(per-cpu计数器):

/*
 * Statistics block of a memcg. struct mem_cgroup refers to it through a
 * __percpu pointer (stat), and mem_cgroup_read_stat() sums count[idx]
 * over the CPUs to obtain one statistic.
 */
struct mem_cgroup_stat_cpu {
/* Signed counters indexed by enum mem_cgroup_stat_index (see mem_cgroup_read_stat). */
long count[MEM_CGROUP_STAT_NSTATS];
/* NOTE(review): the three fields below are not used by any code quoted here; meaning inferred from names only. */
unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct mem_cgroup {
...
/*
* percpu counter.
*/
struct mem_cgroup_stat_cpu __percpu *stat;
}

/*
 * mem_cgroup_read_stat - read one statistic of a single memcg.
 * @memcg: the memory cgroup whose counter is read
 * @idx:   which entry of the count[] array to read
 *
 * Returns the sum of count[@idx] over every possible CPU. The result is
 * signed; the caller quoted in this article (mem_cgroup_recursive_stat)
 * treats a negative total as possible and clamps it.
 */
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
{
long val = 0;
int cpu;

/* Add up this statistic's slot from each CPU's copy of the stat block. */
for_each_possible_cpu(cpu)
val += per_cpu(memcg->stat->count[idx], cpu);
return val;
}

What is root memcg? 初始化时以parent_css == NULL创建出来的那个memcg就是root memory cgroup:

static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *memcg;
long error = -ENOMEM;
int node;

memcg = mem_cgroup_alloc();
if (!memcg)
return ERR_PTR(error);

for_each_node(node)
if (alloc_mem_cgroup_per_zone_info(memcg, node))
goto free_out;

/* root ? */
if (parent_css == NULL) {
root_mem_cgroup = memcg;
res_counter_init(&memcg->res, NULL);
res_counter_init(&memcg->memsw, NULL);
res_counter_init(&memcg->kmem, NULL);
}
root memcg的创建路径: cgroup_init -> cgroup_init_subsys -> mem_cgroup_css_alloc(NULL)

init(system/core/init/init.cpp)会把memory子系统挂载到/dev/memcg(即root memcg),并在其下创建non root memcg(apps和system),如下:

// Set memcg property based on kernel cmdline argument
bool memcg_enabled = android::base::GetBoolProperty("ro.boot.memcg",false);
if (memcg_enabled) {
// root memory control cgroup
mkdir("/dev/memcg", 0700);
chown("/dev/memcg",AID_ROOT,AID_SYSTEM);
mount("none", "/dev/memcg", "cgroup", 0, "memory");
// app mem cgroups, used by activity manager, lmkd and zygote
mkdir("/dev/memcg/apps/",0755);
chown("/dev/memcg/apps/",AID_SYSTEM,AID_SYSTEM);
mkdir("/dev/memcg/system",0550);
chown("/dev/memcg/system",AID_SYSTEM,AID_SYSTEM);
}

non root的memcg创建:

cgroup_mkdir -> create_css -> mem_cgroup_css_alloc

ok, 那lmkd统计的mem usage其实就是root memcg的统计,root memcg就是把所有memcg的mem_cgroup_stat_cpu的count累加。

/*
 * mem_cgroup_recursive_stat - sum one statistic over a memcg tree.
 * @memcg: the memcg passed to for_each_mem_cgroup_tree() as the tree root
 * @idx:   which statistic to sum
 *
 * Calls mem_cgroup_read_stat() for every group the iterator visits
 * (presumably @memcg and all of its descendants - confirm against the
 * iterator's definition) and returns the total, clamped to zero if the
 * signed sum turned out negative.
 */
static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
{
struct mem_cgroup *iter;
long val = 0;

/* Per-cpu values can be negative, use a signed accumulator */
for_each_mem_cgroup_tree(iter, memcg)
val += mem_cgroup_read_stat(iter, idx);

/* A negative total is reported as 0 rather than wrapping in the unsigned return. */
if (val < 0) /* race ? */
val = 0;
return val;
}

也就是说,root memcg的usage_in_bytes统计的是 MEM_CGROUP_STAT_CACHE + MEM_CGROUP_STAT_RSS,memsw.usage_in_bytes则再加上 MEM_CGROUP_STAT_SWAP,最后左移PAGE_SHIFT得到返回值。

看下系统cgroups的情况:

xxx:/dev/memcg/system # cat /proc/cgroups
#subsys_name hierarchy num_cgroups enabled
cpu 3 1 1
cpuacct 2 2 1
memory 1 128 1
freezer 0 1 1
debug 0 1 1

memory这个cgroup子系统就一个hierarchy,其id是1,这个hierarchy里包含了128个memcg。

了解下hierarchy:

  1. Hierarchy support

The memory controller supports a deep hierarchy and hierarchical accounting.
The hierarchy is created by creating the appropriate cgroups in the
cgroup filesystem. Consider for example, the following cgroup filesystem
hierarchy

          root
        /  |   \   
       /   |    \   
      a    b     c   
                 | \ 
                 |  \
                 d   e   

In the diagram above, with hierarchical accounting enabled, all memory
usage of e, is accounted to its ancestors up until the root (i.e, c and root),
that has memory.use_hierarchy enabled. If one of the ancestors goes over its
limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
children of the ancestor.

How to record

主要通过charge/uncharge来记录,文档说明:

  1. Charge
    a page/swp_entry may be charged (usage += PAGE_SIZE) at

    mem_cgroup_try_charge()
    
  2. Uncharge
    a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by

    mem_cgroup_uncharge()
      Called when a page's refcount goes down to 0.
    
    mem_cgroup_uncharge_swap()
      Called when swp_entry's refcnt goes down to 0. A charge against swap
      disappears.
    
  3. charge-commit-cancel
    Memcg pages are charged in two steps:
    mem_cgroup_try_charge()
    mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()

    At try_charge(), there are no flags to say “this page is charged”.
    at this point, usage += PAGE_SIZE.

    At commit(), the page is associated with the memcg.

    At cancel(), simply usage -= PAGE_SIZE.

对root memcg,mem_cgroup_try_charge就不统计res_counter:

static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct res_counter *fail_res;
unsigned long nr_reclaimed;
unsigned long long size;
bool may_swap = true;
bool drained = false;
int ret = 0;

if (mem_cgroup_is_root(memcg))
goto done; // tj: if root memcg, not record for res_counter
...
done:
return ret;
}

cancel charge:

/*
 * cancel_charge - undo a charge of @nr_pages pages against @memcg.
 * @memcg:    the memory cgroup that was charged
 * @nr_pages: number of pages to give back
 *
 * Does nothing for the root memcg. Otherwise uncharges
 * nr_pages * PAGE_SIZE bytes from the res counter and, when
 * do_swap_account is set, from the memsw counter as well.
 */
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
unsigned long bytes = nr_pages * PAGE_SIZE;

/* Mirrors try_charge(), which also skips the res_counter for root. */
if (mem_cgroup_is_root(memcg))
return;

res_counter_uncharge(&memcg->res, bytes);
/* memsw is only uncharged when do_swap_account is set. */
if (do_swap_account)
res_counter_uncharge(&memcg->memsw, bytes);
}

so non root memcg和root memcg是分开统计的,具体实现后面再看。

那什么时候去统计呢?比如把一个page加入page cache时就会try charge。

参考文档

  • kernel3.18/Documentation/cgroups/memory.txt
  • kernel3.18/Documentation/cgroups/memcg_test.txt