问题

workqueue name 就是 alloc_workqueue 的第一个参数:

/**
* alloc_workqueue - allocate a workqueue
* @fmt: printf format for the name of the workqueue
[...]
#define alloc_workqueue(fmt, flags, max_active, args...) \

前文已经说过如何使用 perf sched 来统计 task 调度延迟,但是如下图 workqueue name 被剪掉了:

-----------------------------------------------------------------------------------------------------------------
Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at |
-----------------------------------------------------------------------------------------------------------------
kworker/4:2-eve:404 | 88.386 ms | 207 | avg: 0.248 ms | max: 12.380 ms | max at: 271.002611 s
kworker/u16:18-:1499 | 36.641 ms | 681 | avg: 0.081 ms | max: 4.086 ms | max at: 269.173878 s
kworker/u16:17-:1498 | 21.918 ms | 511 | avg: 0.080 ms | max: 2.706 ms | max at: 272.811798 s
kworker/u16:19-:1500 | 41.742 ms | 713 | avg: 0.077 ms | max: 4.098 ms | max at: 272.342235 s

那如何显示完整name了?我们直接看 perf 源码(kernel 4.19)。

perf 源码分析

打印入口:

static int perf_sched__lat(struct perf_sched *sched)
{
struct rb_node *next;

setup_pager();

if (perf_sched__read_events(sched))
return -1;

perf_sched__merge_lat(sched);
perf_sched__sort_lat(sched);

printf("\n -----------------------------------------------------------------------------------------------------------------\n");
printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at |\n");
printf(" -----------------------------------------------------------------------------------------------------------------\n");

next = rb_first(&sched->sorted_atom_root);

while (next) {
struct work_atoms *work_list;

work_list = rb_entry(next, struct work_atoms, node);
output_lat_thread(sched, work_list);
next = rb_next(next);
thread__zput(work_list->thread);
}

应该是 output_lat_thread():

static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
{
int i;
int ret;
u64 avg;
char max_lat_at[32];

if (!work_list->nb_atoms)
return;
/*
* Ignore idle threads:
*/
if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
return;

sched->all_runtime += work_list->total_runtime;
sched->all_count += work_list->nb_atoms;

if (work_list->num_merged > 1)
ret = printf(" %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
else
ret = printf(" %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);

for (i = 0; i < 24 - ret; i++)

所以,那个 workqueue name 应该是来自 thread__comm_str(work_list->thread)

const char *thread__comm_str(const struct thread *thread)
{
const char *str;

down_read((struct rw_semaphore *)&thread->comm_lock);
str = __thread__comm_str(thread);
up_read((struct rw_semaphore *)&thread->comm_lock);

return str;
}
static const char *__thread__comm_str(const struct thread *thread)
{
const struct comm *comm = thread__comm(thread);

if (!comm)
return NULL;

return comm__str(comm);
}
struct comm *thread__comm(const struct thread *thread)
{
if (list_empty(&thread->comm_list))
return NULL;

return list_first_entry(&thread->comm_list, struct comm, list);
}

啊,原来是thread里有个链表 comm_list,从这个表里取出comm -> comm__str(comm):

const char *comm__str(const struct comm *comm)
{
return comm->comm_str->str;
}

大概看下 struct comm:

struct comm_str;

struct comm {
struct comm_str *comm_str;
u64 start;
struct list_head list;
bool exec;
union { /* Tool specific area */
void *priv;
u64 db_id;
};
};

util/comm.c里:

struct comm_str {
char *str;
struct rb_node rb_node;
refcount_t refcnt;
};

能看到 str 是一个 char * 的指针,那啥时候分配的?分配里不是有大小吗?这个comm.c文件里就有分配实现:

static struct comm_str *comm_str__alloc(const char *str)
{
struct comm_str *cs;

cs = zalloc(sizeof(*cs));
if (!cs)
return NULL;

cs->str = strdup(str); //tj: here
if (!cs->str) {
free(cs);
return NULL;
}

refcount_set(&cs->refcnt, 1);

return cs;
}

是按照入参str的长度来分配的,继续跟。

有两个caller: comm__newcomm__override,看下 comm__new:

struct thread *thread__new(pid_t pid, pid_t tid)
{
char *comm_str;
struct comm *comm;
struct thread *thread = zalloc(sizeof(*thread));

if (thread != NULL) {
thread->pid_ = pid;
thread->tid = tid;
thread->ppid = -1;
thread->cpu = -1;
INIT_LIST_HEAD(&thread->namespaces_list);
INIT_LIST_HEAD(&thread->comm_list);
init_rwsem(&thread->namespaces_lock);
init_rwsem(&thread->comm_lock);

comm_str = malloc(32);
if (!comm_str)
goto err_thread;

snprintf(comm_str, 32, ":%d", tid); //tj: here
comm = comm__new(comm_str, 0, false);
free(comm_str);
if (!comm)
goto err_thread;

list_add(&comm->list, &thread->comm_list);

分配了32个字节,应该够了,那实际用了多少?也就是这个 str 是如何被填入的,那就是 sched record喽。回头再看下相关code:

struct work_atom {
struct list_head list;
enum thread_state state;
u64 sched_out_time;
u64 wake_up_time;
u64 sched_in_time;
u64 runtime;
};

struct work_atoms {
struct list_head work_list;
struct thread *thread;
struct rb_node node;
u64 max_lat;
u64 max_lat_at;
u64 total_lat;
u64 nb_atoms;
u64 total_runtime;
int num_merged;
};

每一个 struct work_atom 定义的work都有相应的行为成员,work_atomlist用来关联到work_atoms

list_add_tail(&atom->list, &atoms->work_list);

看下work_atoms里的thread,这个 thread 里有个 comm_list,除了上文提到的thread_new()会add 一个comm到这个list以外,还有:

static int ____thread__set_comm(struct thread *thread, const char *str,
u64 timestamp, bool exec)
{
struct comm *new, *curr = thread__comm(thread);

/* Override the default :tid entry */
if (!thread->comm_set) {
int err = comm__override(curr, str, timestamp, exec);
if (err)
return err;
} else {
new = comm__new(str, timestamp, exec);
if (!new)
return -ENOMEM;
list_add(&new->list, &thread->comm_list); //tj: here

if (exec)
unwind__flush_access(thread);
}

thread->comm_set = true;

return 0;
}

if comm_set过了,就new一个comm。read这个set code的分支有点多,我们借用dump_stack()看下运行时:

+++ b/tools/perf/util/thread.c
@@ -218,6 +218,8 @@ static int ____thread__set_comm(struct thread *thread, const char *str,
{
struct comm *new, *curr = thread__comm(thread);

+ dump_stack();

能追到:

cmd_record > perf_session__process_events -> perf_session__deliver_event ->machines__deliver_event-> machine__process_comm_event

前面几个都ok,中间有个machines__deliver_event是怎么调到machine__process_comm_event的呢?来看下。

static int machines__deliver_event(struct machines *machines,
struct perf_evlist *evlist,
union perf_event *event,
struct perf_sample *sample,
struct perf_tool *tool, u64 file_offset)
{
[...]
switch (event->header.type) {
[...]
case PERF_RECORD_COMM:
return tool->comm(tool, event, sample, machine);

应该是这里,用了一个callback,看下builtin-sched.c/cmd sched:

int cmd_sched(int argc, const char **argv)
{
const char default_sort_order[] = "avg, max, switch, runtime";
struct perf_sched sched = {
.tool = {
.sample = perf_sched__process_tracepoint_sample,
.comm = perf_sched__process_comm, //tj: here

see? 这里有个tool的comm (perf_sched__process_comm),直觉告诉我就是它:)

static int perf_sched__process_comm(struct perf_tool *tool __maybe_unused,
union perf_event *event,
struct perf_sample *sample,
struct machine *machine)
{
struct thread *thread;
struct thread_runtime *tr;
int err;

err = perf_event__process_comm(tool, event, sample, machine);
if (err)
int perf_event__process_comm(struct perf_tool *tool __maybe_unused,
union perf_event *event,
struct perf_sample *sample,
struct machine *machine)
{
return machine__process_comm_event(machine, event, sample);
}

rt? 到这里整个call stack就清楚了。现在应该知道怎么改了,如果还不知道,check below: