Notes on vruntime in CFS


Preface

When I studied scheduling before, I had some conceptual understanding of vruntime, but I had never actually traced the code, so it always felt half-understood. Today, while presenting the PELT calculation process, I took the opportunity to trace this part, and it finally connected with the concepts I had picked up earlier. A brief record follows. CFS introduces vruntime to guarantee fairness: every task is granted an equal share, measured in vruntime.

Each time, the scheduler picks the entity with the smallest vruntime on the current rq to run (the entities are managed in a red-black tree). After the task runs, its vruntime is updated; note that the increment is not the raw execution time but the execution time scaled by weight, i.e. the time it would correspond to at nice 0. How much actual execution time the task gets is, in turn, decided by its weight.
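For intuition (my own numbers, taken from the standard weight table shown in section 3 below: nice 0 = 1024, nice -5 = 3121): 6 ms of real execution advances a nice-0 task's vruntime by the full 6 ms, but a nice -5 task's by only 6 × 1024 / 3121 ≈ 1.97 ms, so the heavier task stays near the left of the red-black tree longer and gets picked to run more often.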

1. Initialization

If vruntime is important enough to decide which task runs next, how does each task get its vruntime in the first place? Straight to the code:

```c
/*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the tasklist
 *  - preemption disabled
 */
static void task_fork_fair(struct task_struct *p)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se, *curr;
    struct rq *rq = this_rq();

    raw_spin_lock(&rq->lock);
    update_rq_clock(rq);

    cfs_rq = task_cfs_rq(current);
    curr = cfs_rq->curr;
    if (curr) {
        update_curr(cfs_rq);           /* we are in fork, so curr is the parent: update its vruntime first */
        se->vruntime = curr->vruntime; /* inherit the parent's vruntime */
    }
    place_entity(cfs_rq, se, 1);       /* adjust the child's vruntime according to weight */

    if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
        /* child-runs-first is set and parent vruntime < child vruntime: swap them */
        /*
         * Upon rescheduling, sched_class::put_prev_task() will place
         * 'current' within the tree based on its new key value.
         */
        swap(curr->vruntime, se->vruntime);
        resched_curr(rq);
    }

    se->vruntime -= cfs_rq->min_vruntime; /* store vruntime - min_vruntime; this is the key used when inserting into the rq's red-black tree */

    raw_spin_unlock(&rq->lock);
}
```

The function above is the vruntime initialization path when a task is created:

1. The child's vruntime starts from the parent's freshly updated vruntime.
2. Take the cfs_rq's min_vruntime and compute min_vruntime + sched_vslice; this sched_vslice is the actual execution time the task is entitled to, derived from the ratio of its load to the load of all tasks on the cfs_rq, then converted into nice-0 vruntime.
3. Give the newly created entity the larger of that computed value and the inherited one.
4. If sysctl_sched_child_runs_first is set and the child's vruntime > the parent's vruntime, swap them.
5. Subtract cfs_rq->min_vruntime from vruntime before inserting the entity into the CFS red-black tree.

In other words, to guarantee fairness, the vruntime granted to every task is accounted as if the task were at nice 0; within a scheduling period all tasks split the period in proportion to their weights, and it is the actual execution time that is computed from this load ratio.

1.1 The core is in step 2; let's look at it concretely:

```c
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
    u64 vruntime = cfs_rq->min_vruntime; /* start from the rq's min_vruntime */

    if (initial && sched_feat(START_DEBIT))
        vruntime += sched_vslice(cfs_rq, se); /* key step: the time slice this task gets, computed from the weights of all tasks on the rq */

    /* taken when the entity is placed again later, not on first placement; not our focus here */
    if (!initial) {
        unsigned long thresh = sysctl_sched_latency;

        if (sched_feat(GENTLE_FAIR_SLEEPERS))
            thresh >>= 1;

        vruntime -= thresh;
    }

    se->vruntime = max_vruntime(se->vruntime, vruntime); /* keep the larger of the two */
}
```
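A quick worked example with invented numbers: suppose cfs_rq->min_vruntime is 100 ms and sched_vslice() comes out to 3 ms for the new task. With START_DEBIT the candidate value is 103 ms; if the vruntime inherited from the parent is 101 ms, max_vruntime() keeps 103 ms. The child therefore starts with a small "debit" and cannot jump ahead of every existing task the moment it is forked.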

1.2 Tracing further into sched_vslice:

This function essentially computes, from the weights, the time the task should be allocated:

```c
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

/* slice * weight / load: from the rq's total weight, compute the execution time this task is allotted */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); /* the period to distribute, sized by the number of tasks */

    for_each_sched_entity(se) { /* walk the task's scheduling entity (and its parents under group scheduling) */
        struct load_weight *load;
        struct load_weight lw; /* total weight on the cfs_rq */

        cfs_rq = cfs_rq_of(se);
        load = &cfs_rq->load; /* total weight on this rq */

        if (unlikely(!se->on_rq)) {
            lw = cfs_rq->load;

            update_load_add(&lw, se->load.weight); /* not queued yet: add this task's weight to the total */
            load = &lw;
        }
        slice = __calc_delta(slice, se->load.weight, load); /* slice = slice * weight / load */
    }
    return slice;
}

/*
 * nice 0 is special-cased here, which shows that tasks at other nice
 * levels have their vruntime accounted as if they were at nice 0
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
    if (unlikely(se->load.weight != NICE_0_LOAD))
        delta = __calc_delta(delta, NICE_0_LOAD, &se->load); /* delta * NICE_0_LOAD / weight */

    return delta;
}

/* the actual computation: effectively delta_exec * weight / lw */
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
    u64 fact = scale_load_down(weight);
    int shift = WMULT_SHIFT;

    __update_inv_weight(lw);

    if (unlikely(fact >> 32)) {
        while (fact >> 32) {
            fact >>= 1;
            shift--;
        }
    }

    /* hint to use a 32x32->64 mul */
    fact = (u64)(u32)fact * lw->inv_weight;

    while (fact >> 32) {
        fact >>= 1;
        shift--;
    }

    return mul_u64_u32_shr(delta_exec, fact, shift);
}
```
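To make the math concrete, here is a minimal user-space sketch (my own demo, not kernel code: it assumes the common 6 ms sysctl_sched_latency default, invents a two-task rq with one nice-0 and one nice-5 task, and uses plain division where the kernel's __calc_delta uses the precomputed inverse weight):

```c
/* Sketch: how sched_slice and sched_vslice split a 6 ms period
 * between a nice-0 (weight 1024) and a nice-5 (weight 335) task. */
#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD 1024ULL
#define PERIOD_NS   6000000ULL /* assuming a 6 ms scheduling period */

int main(void)
{
    uint64_t w0 = 1024, w5 = 335; /* weights for nice 0 and nice 5 */
    uint64_t total = w0 + w5;     /* stand-in for cfs_rq->load */

    /* sched_slice: each task's wall-clock share of the period, by weight */
    uint64_t slice0 = PERIOD_NS * w0 / total; /* ~4.52 ms */
    uint64_t slice5 = PERIOD_NS * w5 / total; /* ~1.48 ms */

    /* sched_vslice: the same slices expressed in nice-0 vruntime */
    uint64_t vslice0 = slice0 * NICE_0_LOAD / w0;
    uint64_t vslice5 = slice5 * NICE_0_LOAD / w5;

    printf("slice:  nice0=%llu ns  nice5=%llu ns\n",
           (unsigned long long)slice0, (unsigned long long)slice5);
    printf("vslice: nice0=%llu ns  nice5=%llu ns (equal, up to rounding)\n",
           (unsigned long long)vslice0, (unsigned long long)vslice5);
    return 0;
}
```

The wall-clock slices differ (≈4.52 ms vs ≈1.48 ms, proportional to weight), yet once converted to nice-0 vruntime both come out at ≈4.52 ms; equal vruntime shares are exactly what CFS calls fair.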

2. Update

As we would expect, vruntime is updated every time the running task's execution is accounted: the time just run is added on, which determines where the task sits for its next turn:

```c
static void update_curr(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;
    u64 now = rq_clock_task(rq_of(cfs_rq));
    u64 delta_exec;

    if (unlikely(!curr))
        return;

    delta_exec = now - curr->exec_start;
    if (unlikely((s64)delta_exec <= 0))
        return;

    curr->exec_start = now;

    schedstat_set(curr->statistics.exec_max,
                  max(delta_exec, curr->statistics.exec_max));

    curr->sum_exec_runtime += delta_exec;
    schedstat_add(cfs_rq->exec_clock, delta_exec);

    curr->vruntime += calc_delta_fair(delta_exec, curr); /* the only line we care about, analyzed above: convert the execution time to nice-0 vruntime */
    update_min_vruntime(cfs_rq);

    if (entity_is_task(curr)) {
        struct task_struct *curtask = task_of(curr);

        trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
        cpuacct_charge(curtask, delta_exec);
        account_group_exec_runtime(curtask, delta_exec);
    }

    account_cfs_rq_runtime(cfs_rq, delta_exec);
}
```

update_curr is where vruntime gets updated, and the update itself is simple: the actual execution time becomes delta * NICE_0_LOAD / weight, i.e. the real execution time converted into nice-0 virtual time. In other words, how fast a task's vruntime advances is strongly tied to its weight.
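A minimal, runnable user-space sketch of that conversion (my own demo: weights taken from the table in section 3 below, plain division standing in for __calc_delta's fixed-point path):

```c
/* Sketch of the nice-0 normalization done in update_curr():
 * vruntime += delta_exec * NICE_0_LOAD / weight. Not kernel code. */
#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD 1024ULL

/* Simplified calc_delta_fair: plain 64-bit division instead of the
 * kernel's inverse-weight multiply-and-shift. */
static uint64_t calc_delta_fair_simple(uint64_t delta_ns, uint64_t weight)
{
    return delta_ns * NICE_0_LOAD / weight;
}

int main(void)
{
    uint64_t delta = 10000000ULL; /* 10 ms of real execution, in ns */

    /* Weights from sched_prio_to_weight for nice -5, 0 and 5. */
    printf("nice -5 (w=3121): vruntime += %llu ns (~3.28 ms)\n",
           (unsigned long long)calc_delta_fair_simple(delta, 3121));
    printf("nice  0 (w=1024): vruntime += %llu ns (exactly 10 ms)\n",
           (unsigned long long)calc_delta_fair_simple(delta, 1024));
    printf("nice  5 (w= 335): vruntime += %llu ns (~30.57 ms)\n",
           (unsigned long long)calc_delta_fair_simple(delta, 335));
    return 0;
}
```

The higher the weight, the slower vruntime grows for the same wall-clock time, and the sooner the task is picked to run again.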

3. Weight calculation

A process's weight is decided by a static array: as the nice value goes from -20 to 19, the weights fall in a fixed geometric ratio.

```c
/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
const int sched_prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};

/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
const u32 sched_prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
```
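A quick sanity check on the inverse table (my own arithmetic): for nice 0, the weight is 1024 = 2^10, and 2^32 / 2^10 = 2^22 = 4194304, exactly the nice-0 entry of sched_prio_to_wmult. This is what lets __calc_delta turn delta * NICE_0_LOAD / weight into (delta * NICE_0_LOAD * inv_weight) >> 32, a multiply and shift instead of a division.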

By design, raising a task's priority by one level (lowering nice by 1) gains it roughly 10% more CPU time.
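Where the ~10% comes from (worked numbers, mine): two CPU-bound tasks at nice 0 and nice 1 have weights 1024 and 820, so they receive 1024/1844 ≈ 55.5% and 820/1844 ≈ 44.5% of the CPU, roughly a 10-point gap; the ratio between adjacent levels, 1024/820 ≈ 1.25, is the multiplier mentioned in the kernel comment above.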
