/*
 * select_task_rq_fair: Select target runqueue for the waking task in domains
 * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
 *
 * Balances load by selecting the idlest CPU in the idlest group, or under
 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
 *
 * Returns the target CPU number.
 */
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
{
	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
	struct sched_domain *tmp, *sd = NULL;
	int cpu = smp_processor_id();
	int new_cpu = prev_cpu;
	int want_affine = 0;
	int target_cpu = -1;
	/* SD_flags and WF_flags share the first nibble */
	int sd_flag = wake_flags & 0xF;

	if (trace_android_rvh_select_task_rq_fair_enabled() &&
	    !(sd_flag & SD_BALANCE_FORK))
		sync_entity_load_avg(&p->se);
	trace_android_rvh_select_task_rq_fair(p, prev_cpu, sd_flag,
			wake_flags, &target_cpu);
	if (target_cpu >= 0)
		return target_cpu;

	/*
	 * required for stable ->cpus_allowed
	 */
	lockdep_assert_held(&p->pi_lock);
	if (wake_flags & WF_TTWU) {
		record_wakee(p);

		/*
		 * Decide whether the task (p) may simply be placed on the
		 * current CPU (cpu): check the wake flags and the task's
		 * allowed CPU mask (p->cpus_ptr) to see whether this CPU
		 * is a suitable target.
		 */
		if ((wake_flags & WF_CURRENT_CPU) &&
		    cpumask_test_cpu(cpu, p->cpus_ptr))
			return cpu;

		if (!is_rd_overutilized(this_rq()->rd)) {
			new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync);
			if (new_cpu >= 0)
				return new_cpu;
			new_cpu = prev_cpu;
		}

		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
	}

	rcu_read_lock();
	for_each_domain(cpu, tmp) {
		/*
		 * If both 'cpu' and 'prev_cpu' are part of this domain,
		 * cpu is a valid SD_WAKE_AFFINE target.
		 */
		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
			if (cpu != prev_cpu)
				new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);

			sd = NULL; /* Prefer wake_affine over balance flags */
			break;
		}

		/*
		 * Usually only true for WF_EXEC and WF_FORK, as sched_domains
		 * usually do not have SD_BALANCE_WAKE set. That means wakeup
		 * will usually go to the fast path.
		 */
		if (tmp->flags & sd_flag)
			sd = tmp;
		else if (!want_affine)
			break;
	}

	if (unlikely(sd)) {
		/* Slow path */
		new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
	} else if (wake_flags & WF_TTWU) { /* XXX always ? */
		/* Fast path */
		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
	}
	rcu_read_unlock();

	return new_cpu;
}

This is select_task_rq_fair(), the CPU-selection routine of CFS (the Completely Fair Scheduler) in the Linux kernel: it picks the target runqueue (i.e. the target CPU) for a task that is being woken up, forked or exec'ed.

I. SD_FLAG

1. Common sd_flag values

Flag              Value   Meaning                     Typical scenario
SD_BALANCE_WAKE   0x8     Load balance on wakeup      Ordinary task wakeup
SD_BALANCE_FORK   0x4     Load balance on fork        fork() creating a new process
SD_BALANCE_EXEC   0x2     Load balance on exec        exec() loading a new program
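
For reference, these are the flag definitions the function relies on in recent kernels. This is a paraphrased sketch of include/linux/sched/sd_flags.h and kernel/sched/sched.h, not verbatim kernel source, and the exact values can differ between versions, so check your own tree. The key property is that the low nibble of the WF_* wake flags is kept identical to the corresponding SD_BALANCE_* bits, which is why the code can simply compute sd_flag = wake_flags & 0xF.

/* Scheduling-domain balance flags (include/linux/sched/sd_flags.h) */
#define SD_BALANCE_NEWIDLE	0x0001	/* balance when a CPU is about to go idle */
#define SD_BALANCE_EXEC		0x0002	/* balance on exec()                      */
#define SD_BALANCE_FORK		0x0004	/* balance on fork()/clone()              */
#define SD_BALANCE_WAKE		0x0008	/* balance on task wakeup                 */

/* Wake flags (kernel/sched/sched.h) - the low nibble mirrors the SD flags */
#define WF_EXEC			0x02	/* wakeup after exec;  maps to SD_BALANCE_EXEC */
#define WF_FORK			0x04	/* wakeup after fork;  maps to SD_BALANCE_FORK */
#define WF_TTWU			0x08	/* "real" wakeup;      maps to SD_BALANCE_WAKE */
#define WF_SYNC			0x10	/* the waker goes to sleep after the wakeup    */
#define WF_CURRENT_CPU		0x40	/* prefer to move the wakee to the current CPU */

/* Hence the mapping used by select_task_rq_fair(): */
int sd_flag = wake_flags & 0xF;		/* picks out the matching SD_BALANCE_* bit */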

2. WF_TTWU - the wakeup-type flag

wake_flags & WF_TTWU   /* set when this is a genuine task wakeup (try_to_wake_up path) */

Concrete scenarios

Cases where WF_TTWU is set:

  • A task is woken from sleep (e.g. I/O completion, a lock being released)
  • A task resumes after blocking/suspension
  • A task is woken by a signal or an expiring timer

Cases where WF_TTWU is not set:

  • Choosing a CPU for a new task at fork() time (WF_FORK; the task has not run yet)
  • Re-selecting a CPU at exec() time (WF_EXEC)
  • CPU selection during load-balance migrations

3. Relationship and differences between the two kinds of flags

Aspect    sd_flag                                      WF_TTWU
Purpose   Selects at which level load balancing runs   Decides whether full wakeup handling runs
Value     Bit mask (several possible bits)             Single boolean flag (set / not set)
Effect    Picks the slow-path scheduling domain        Enables the fast-path optimizations
Context   Tied to the scheduling-domain topology       Tied to the type of wakeup

4. Typical flows

// Scenario 1: ordinary task wakeup (WF_TTWU, which maps to SD_BALANCE_WAKE)
wake_up(task);  // try_to_wake_up() sets WF_TTWU
→ fast path: affine wakeup (wake_affine) + pick an idle sibling CPU

// Scenario 2: fork() of a new task (WF_FORK / SD_BALANCE_FORK, no WF_TTWU)
fork();  // sets WF_FORK
→ slow path: balance within the scheduling domains (idlest group, then idlest CPU)

// Scenario 3: exec() of a new program (WF_EXEC / SD_BALANCE_EXEC, no WF_TTWU)
exec();  // sets WF_EXEC
→ slow path: re-select a suitable CPU from scratch
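
To make these scenarios concrete, here is roughly where the flags are set in kernel/sched/core.c. This is a heavily paraphrased sketch, not verbatim kernel code; the exact signatures around select_task_rq() have changed several times across versions.

/* 1) Ordinary wakeup: try_to_wake_up() tags the wakeup with WF_TTWU,
 *    so select_task_rq_fair() sees sd_flag == SD_BALANCE_WAKE. */
	wake_flags |= WF_TTWU;
	cpu = select_task_rq(p, p->wake_cpu, wake_flags);

/* 2) New task: wake_up_new_task() places the child with WF_FORK before
 *    its first run, so the slow path (SD_BALANCE_FORK) is used. */
	__set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));

/* 3) exec(): sched_exec() re-selects a CPU with WF_EXEC while the task's
 *    memory footprint is smallest, making a migration cheap. */
	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);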

5. Summary

Key role in the code

if (wake_flags & WF_TTWU) {
    record_wakee(p);                    // record the waker/wakee relationship

    // Only a genuine wakeup attempts energy-aware placement
    if (!is_rd_overutilized(this_rq()->rd)) {
        new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync);
    }

    want_affine = !wake_wide(p) && ...; // decide whether an affine wakeup is worth trying
}

// ...

if (wake_flags & WF_TTWU) { /* XXX always ? */
    // fast path: look for an idle sibling CPU
    new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
}

// Scenario 1: system lightly loaded, EAS enabled
is_rd_overutilized(rd) == false
→ use find_energy_efficient_cpu() to find the most energy-efficient CPU
→ the task may be placed on a little core to save power

// Scenario 2: system overutilized, EAS enabled
is_rd_overutilized(rd) == true
→ skip energy-aware placement and fall back to conventional load balancing
→ spread load across CPUs to preserve performance

// Scenario 3: EAS disabled
is_rd_overutilized(rd) == true (always)
→ always use the conventional load-balancing policy
→ compatibility and performance first
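
The "EAS enabled" condition above boils down to a static key. For reference, the helper looks roughly like this in kernel/sched/sched.h (a sketch for recent kernels; the key is only switched on when an Energy Model and a schedutil-style cpufreq setup are present):

#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
extern struct static_key_false sched_energy_present;

static inline bool sched_energy_enabled(void)
{
	/* true only once Energy Aware Scheduling has been set up */
	return static_branch_unlikely(&sched_energy_present);
}
#else
static inline bool sched_energy_enabled(void) { return false; }
#endif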

II. Recording wakeup relationships

record_wakee() records which task the current task (current) has just woken and keeps a running statistic of how often that target changes. The Linux scheduler uses this to track waker/wakee relationships, which in turn feeds load-balancing and wake-up affinity decisions.

1. Background and purpose

The scheduler has to decide dynamically where tasks run (for example, which CPU to pick on a multi-core system). To make good placement decisions it needs some notion of the wakeup relationships between tasks. For example:

  • If task A frequently wakes task B, placing A and B on the same CPU (or at least the same cache domain) can reduce cache misses and context-switch overhead.
  • If two tasks rarely wake each other, their relative placement matters much less.

The core goal of record_wakee() is to record and quantify exactly this relationship.

static void record_wakee(struct task_struct *p)
{
	/*
	 * Only decay a single time; tasks that have less then 1 wakeup per
	 * jiffy will not have built up many flips.
	 */
	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
		current->wakee_flips >>= 1;
		current->wakee_flip_decay_ts = jiffies;
	}

	if (current->last_wakee != p) {
		current->last_wakee = p;
		current->wakee_flips++;
	}
}

2. Step by step

if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
    current->wakee_flips >>= 1;
    current->wakee_flip_decay_ts = jiffies;
}
  • Variables

    • jiffies: the kernel's global tick counter, i.e. the number of timer ticks since boot.
    • HZ: the number of ticks per second (typically 100, 250 or 1000, depending on the kernel configuration).
    • current->wakee_flip_decay_ts: timestamp of the last decay of the flip counter.
    • current->wakee_flips: the current task's wakee-flip counter.
  • Logic

    • At most once per HZ ticks (i.e. once per second), wakee_flips is halved (right-shifted by one).
    • This is an exponential decay: it keeps the counter from growing without bound while still reflecting recent wakeup activity.
    • A task that wakes others less than once per second will see its wakee_flips drift towards zero.
if (current->last_wakee != p) {
    current->last_wakee = p;
    current->wakee_flips++;
}
  • Variables

    • current->last_wakee: the task that the current task woke most recently.
    • p: the task being woken now.
  • Logic

    • If the current task is waking a different task than last time (p != last_wakee):
      • update last_wakee to p;
      • increment wakee_flips, i.e. count one "flip" of the wakee.
    • If the current task keeps waking the same task, wakee_flips is not incremented.
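
To see how the decay and the flip counting interact, here is a small stand-alone user-space model of the same bookkeeping. It is purely illustrative: the struct, the HZ value and the scenario are made up, and it is not kernel code.

/* build: gcc -o wakee_demo wakee_demo.c && ./wakee_demo */
#include <stdio.h>

#define HZ 100	/* pretend 100 ticks per second, a common kernel config */

/* Simplified model of the wakee_flips bookkeeping: the waker bumps a
 * counter whenever it wakes a *different* task than last time, and the
 * counter is halved at most once per simulated second. */
struct waker {
	int last_wakee;			/* id of the task woken last time */
	unsigned int wakee_flips;	/* how often the wakee has changed */
	unsigned long decay_ts;		/* last decay time, in fake jiffies */
};

static void record_wakee_model(struct waker *w, int wakee, unsigned long now)
{
	if (now > w->decay_ts + HZ) {		/* decay a single time */
		w->wakee_flips >>= 1;
		w->decay_ts = now;
	}
	if (w->last_wakee != wakee) {		/* woke somebody new: one flip */
		w->last_wakee = wakee;
		w->wakee_flips++;
	}
}

int main(void)
{
	struct waker w = { .last_wakee = -1 };
	unsigned long now = 0;
	int i;

	/* Phase 1: keep waking the same partner -> flips stays at 1. */
	for (i = 0; i < 50; i++)
		record_wakee_model(&w, 7, now++);
	printf("same partner repeatedly:  flips = %u\n", w.wakee_flips);

	/* Phase 2: wake a different task every time -> flips climbs fast. */
	for (i = 0; i < 50; i++)
		record_wakee_model(&w, i, now++);
	printf("fanning out to many:      flips = %u\n", w.wakee_flips);

	/* Phase 3: an idle gap -> the counter is halved once on the next call. */
	now += 5 * HZ;
	record_wakee_model(&w, 7, now);
	printf("after an idle gap:        flips = %u\n", w.wakee_flips);
	return 0;
}

Waking the same partner over and over leaves the counter at 1, fanning out to many different tasks drives it up quickly, and after an idle gap it is halved once on the next wakeup (the kernel only ever decays one step per check).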

3. What the function is for

  • The scheduler uses wakee_flips and last_wakee to judge the wakeup affinity between tasks; wake_wide() is the consumer of these counters (see the sketch below).
  • For example, if a task frequently wakes one specific partner task, the scheduler can try to keep the two on the same CPU to improve performance.
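
For completeness, this is roughly how wake_wide() consumes those counters in current kernels (paraphrased from kernel/sched/fair.c; details may differ in your tree). It compares the waker's and the wakee's flip counts against the size of the LLC domain, and only when both sides look "wide" does it give up on an affine wakeup:

/*
 * Detect M:N waker/wakee relationships via a switching-frequency heuristic:
 * a large wakee_flips means the task keeps waking different partners. If
 * both waker and wakee flip faster than the LLC size can absorb, an affine
 * wakeup is unlikely to pay off, so report "wide" and skip want_affine.
 */
static int wake_wide(struct task_struct *p)
{
	unsigned int master = current->wakee_flips;
	unsigned int slave = p->wakee_flips;
	int factor = __this_cpu_read(sd_llc_size);	/* #CPUs sharing the LLC */

	if (master < slave)
		swap(master, slave);
	if (slave < factor || master < slave * factor)
		return 0;	/* looks 1:1-ish -> an affine wakeup may help */
	return 1;
}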

4. Where it is used

The main scenarios this information feeds into:

  • Load balancing
    • On multi-core systems, the scheduler can take the wakeup relationship into account when deciding whether to migrate a task to another CPU.
  • Wake-up affinity
    • If task A frequently wakes task B, task B can preferentially be scheduled onto task A's CPU, cutting cross-CPU communication overhead.
  • Performance
    • Fewer cache misses and cheaper context switches improve overall system performance.

III. How is a CPU judged to be overutilized?

/*
 * overutilized value make sense only if EAS is enabled
 */
static inline bool is_rd_overutilized(struct root_domain *rd)
{
	return !sched_energy_enabled() || READ_ONCE(rd->overutilized);
}
  • rd->overutilized holds the root domain's real overutilized state; with EAS disabled the helper always reports "overutilized", so the energy-aware path is skipped.

The actual per-CPU check is done in cpu_overutilized():

static inline bool cpu_overutilized(int cpu)
{
    unsigned long rq_util_min, rq_util_max;
    int overutilized = -1;

    // 1. Vendor hook: a vendor module may override the decision
    trace_android_rvh_cpu_overutilized(cpu, &overutilized);
    if (overutilized != -1)
        return overutilized;

    // 2. Without EAS the overutilized state is meaningless
    if (!sched_energy_enabled())
        return false;

    // 3. Read the runqueue's aggregated uclamp limits
    rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
    rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);

    // 4. Core check: does the CPU's CFS utilization still fit its capacity?
    return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}
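
util_fits_cpu() ultimately relies on fits_capacity(), a small macro in kernel/sched/fair.c that builds in roughly 20% headroom: utilization only "fits" while it stays below about 80% of the capacity. For illustration (the example numbers are mine, not from the source):

/*
 * The margin used when comparing utilization with CPU capacity (~20%):
 * util fits iff util * 1280 < capacity * 1024, i.e. util < ~0.8 * capacity.
 */
#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)

/* Example: a big CPU with capacity 1024 stops "fitting" once util exceeds 819,
 * a little CPU with capacity 512 once util exceeds 409. */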


unsigned long cpu_util_cfs(int cpu)
{
	// get this CPU's CFS utilization
	return cpu_util(cpu, NULL, -1, 0);
}
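
The arguments passed here select the plain, un-adjusted view of the runqueue. Roughly, cpu_util() in kernel/sched/fair.c has the following shape (paraphrased from its kernel-doc comment; verify against your kernel version):

/*
 * cpu_util() - estimate the CPU capacity used by CFS tasks on @cpu.
 * @cpu:     the CPU whose utilization we want
 * @p:       a task whose presence should be simulated, or NULL for none
 * @dst_cpu: the CPU @p would migrate to (-1 means no migration to model)
 * @boost:   non-zero to also reflect runnable pressure, 0 for plain util
 */
static unsigned long cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost);

/* So cpu_util_cfs(cpu) == cpu_util(cpu, NULL, -1, 0): no task simulated, no boost. */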




static inline int util_fits_cpu(unsigned long util,
				unsigned long uclamp_min,
				unsigned long uclamp_max,
				int cpu)
{
	unsigned long capacity = capacity_of(cpu);
	unsigned long capacity_orig;
	bool fits, uclamp_max_fits, done = false;

	trace_android_rvh_util_fits_cpu(util, uclamp_min, uclamp_max, cpu, &fits, &done);

	if (done)
		return fits;

	/*
	 * Check if the real util fits without any uclamp boost/cap applied.
	 */
	fits = fits_capacity(util, capacity);

	if (!uclamp_is_used())
		return fits;

	/*
	 * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
	 * uclamp_max. We only care about capacity pressure (by using
	 * capacity_of()) for comparing against the real util.
	 *
	 * If a task is boosted to 1024 for example, we don't want a tiny
	 * pressure to skew the check whether it fits a CPU or not.
	 *
	 * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
	 * should fit a little cpu even if there's some pressure.
	 *
	 * Only exception is for HW or cpufreq pressure since it has a direct impact
	 * on available OPP of the system.
	 *
	 * We honour it for uclamp_min only as a drop in performance level
	 * could result in not getting the requested minimum performance level.
	 *
	 * For uclamp_max, we can tolerate a drop in performance level as the
	 * goal is to cap the task. So it's okay if it's getting less.
	 */
	capacity_orig = arch_scale_cpu_capacity(cpu);

	/*
	 * We want to force a task to fit a cpu as implied by uclamp_max.
	 * But we do have some corner cases to cater for..
	 *
	 *
	 *                                 C=z
	 *   |                             ___
	 *   |                  C=y       |   |
	 *   |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _  uclamp_max
	 *   |      C=x        |   |      |   |
	 *   |      ___        |   |      |   |
	 *   |     |   |       |   |      |   |    (util somewhere in this region)
	 *   |     |   |       |   |      |   |
	 *   |     |   |       |   |      |   |
	 *   +----------------------------------------
	 *         CPU0        CPU1       CPU2
	 *
	 *   In the above example if a task is capped to a specific performance
	 *   point, y, then when:
	 *
	 *   * util = 80% of x then it does not fit on CPU0 and should migrate
	 *     to CPU1
	 *   * util = 80% of y then it is forced to fit on CPU1 to honour
	 *     uclamp_max request.
	 *
	 *   which is what we're enforcing here. A task always fits if
	 *   uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
	 *   the normal upmigration rules should withhold still.
	 *
	 *   Only exception is when we are on max capacity, then we need to be
	 *   careful not to block overutilized state. This is so because:
	 *
	 *     1. There's no concept of capping at max_capacity! We can't go
	 *        beyond this performance level anyway.
	 *     2. The system is being saturated when we're operating near
	 *        max capacity, it doesn't make sense to block overutilized.
	 */
	uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
	uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
	fits = fits || uclamp_max_fits;

	/*
	 *
	 *                                 C=z
	 *   |                             ___       (region a, capped, util >= uclamp_max)
	 *   |                  C=y       |   |
	 *   |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
	 *   |      C=x        |   |      |   |
	 *   |      ___        |   |      |   |      (region b, uclamp_min <= util <= uclamp_max)
	 *   |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
	 *   |     |   |       |   |      |   |
	 *   |     |   |       |   |      |   |      (region c, boosted, util < uclamp_min)
	 *   +----------------------------------------
	 *         CPU0        CPU1       CPU2
	 *
	 * a) If util > uclamp_max, then we're capped, we don't care about
	 *    actual fitness value here. We only care if uclamp_max fits
	 *    capacity without taking margin/pressure into account.
	 *    See comment above.
	 *
	 * b) If uclamp_min <= util <= uclamp_max, then the normal
	 *    fits_capacity() rules apply. Except we need to ensure that we
	 *    enforce we remain within uclamp_max, see comment above.
	 *
	 * c) If util < uclamp_min, then we are boosted. Same as (b) but we
	 *    need to take into account the boosted value fits the CPU without
	 *    taking margin/pressure into account.
	 *
	 * Cases (a) and (b) are handled in the 'fits' variable already. We
	 * just need to consider an extra check for case (c) after ensuring we
	 * handle the case uclamp_min > uclamp_max.
	 */
	uclamp_min = min(uclamp_min, uclamp_max);
	if (fits && (util < uclamp_min) &&
	    (uclamp_min > get_actual_cpu_capacity(cpu)))
		return -1;

	return fits;
}
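
To make the three regions concrete, here is a small stand-alone user-space model of the decision. It deliberately ignores the vendor hook, the uclamp_is_used() shortcut and capacity pressure (so capacity == capacity_orig); the helper names and the numbers are made up for illustration only.

#include <stdio.h>
#include <stdbool.h>

#define SCHED_CAPACITY_SCALE	1024
#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

/* Simplified model of util_fits_cpu(): returns 1 / 0 / -1 like the real
 * function; -1 means "a boosted task cannot get its requested minimum here". */
static int util_fits_cpu_model(unsigned long util, unsigned long uclamp_min,
			       unsigned long uclamp_max, unsigned long capacity_orig)
{
	bool fits = fits_capacity(util, capacity_orig);
	bool uclamp_max_fits;

	/* A capped task is forced to fit as long as uclamp_max itself fits,
	 * except on a max-capacity CPU, where capping is meaningless. */
	uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) &&
			  (uclamp_max == SCHED_CAPACITY_SCALE);
	uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
	fits = fits || uclamp_max_fits;

	/* Region c: a boost the CPU cannot deliver at all -> misfit (-1). */
	uclamp_min = min_ul(uclamp_min, uclamp_max);
	if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig))
		return -1;

	return fits;
}

int main(void)
{
	/* Little CPU (capacity 512), uncapped task at util 450:
	 * the raw check fails (450 * 1280 >= 512 * 1024) -> 0. */
	printf("uncapped,    util 450 on little CPU: %d\n",
	       util_fits_cpu_model(450, 0, SCHED_CAPACITY_SCALE, 512));

	/* Same task capped with uclamp_max = 400: forced to fit (region a) -> 1. */
	printf("capped 400,  util 450 on little CPU: %d\n",
	       util_fits_cpu_model(450, 0, 400, 512));

	/* Boosted task: uclamp_min = 800 cannot be honoured on a 512 CPU,
	 * even though its real util (100) would fit -> -1. */
	printf("boosted 800, util 100 on little CPU: %d\n",
	       util_fits_cpu_model(100, 800, SCHED_CAPACITY_SCALE, 512));
	return 0;
}

The three printf lines print 0, 1 and -1: a normal fit failure (region b), a capped task forced to fit (region a), and a boosted task reported as a misfit (region c).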

Updating the root domain's overutilized state

The overutilized state is maintained by the following mechanism:

static inline void check_update_overutilized_status(struct rq *rq)
{
	/*
	 * overutilized field is used for load balancing decisions only
	 * if energy aware scheduler is being used
	 */

	if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
		set_rd_overutilized(rq->rd, 1);
}
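
For context, the setter called above looks roughly like this in current kernels (a paraphrased sketch; older trees use a slightly different helper name, and the flag is cleared again from the load-balance statistics path once no CPU reports overutilization):

static inline void set_rd_overutilized(struct root_domain *rd, bool flag)
{
	/* The flag is meaningless without EAS, so don't even write it */
	if (!sched_energy_enabled())
		return;

	WRITE_ONCE(rd->overutilized, flag);
	trace_sched_overutilized_tp(rd, flag);
}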

To be continued...
