之前遇到一个耗电问题,最后发现是/proc/sys/kernel/sched_boost节点设置异常,一直处于boost状态。导致所有场景功耗上升。
现在总结一下sched_boost的相关知识。
Sched_Boost
sched_boost主要是通过影响Task placement的方式,来进行boost。它属于QTI EAS中的一部分。
默认task placement policy
计算每个cpu的负载,并将task分配到负载最轻的cpu上。如果有多个cpu的负载相同(一般是都处于idle),那么就会把task分配到系统中capacity最大的cpu上。
设置sched_boost
通过设置节点:/proc/sys/kernel/sched_boost 或者内核调用sched_set_boost()函数,可以进行sched_boost,并且在分配任务时,忽略对energy的消耗。
boost一旦设置之后,就必须显式写0来关闭。同时也支持多个应用同时调用设置,设置会选择boost等级最高的生效;而当所有应用都关闭boost时,boost才会真正失效。
boost等级
sched_boost一共有4个等级,除了0代表关闭boost以外,其他3个等级灵活地控制功耗和性能的不同倾向程度。
在通过节点设置时,会调用sched_boost_handler:
{
	/*
	 * sysctl table entry named "sched_boost". Accesses are routed to
	 * sched_boost_handler(), which parses them with
	 * proc_dointvec_minmax(); extra1/extra2 point at the lower/upper
	 * bounds that helper enforces (the variables neg_three and three).
	 */
	.procname	= "sched_boost",
	.data		= &sysctl_sched_boost,
	.maxlen		= sizeof(unsigned int),
	.mode		= 0644,
	.proc_handler	= sched_boost_handler,
	.extra1		= &neg_three,
	.extra2		= &three,
},
经过verify之后,调用_sched_set_boost来设置boost。
int sched_boost_handler(struct ctl_table table, int write,
void __user buffer, size_t lenp,
loff_t ppos)
{
int ret;
unsigned int data = (unsigned int )table->data;
mutex_lock(boost_mutex);
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
goto done;
if (verify_boost_params(data))
_sched_set_boost(data);
else
ret = -EINVAL;
done:
mutex_unlock(boost_mutex);
return ret;
而通过内核调用的方式,同样最后也是调用_sched_set_boost来设置boost。
/*
 * sched_set_boost() - in-kernel entry point for changing the boost state.
 * @type: boost request; it is checked with verify_boost_params() before
 *        being handed to _sched_set_boost().
 *
 * Validation and the update both run with boost_mutex held.
 *
 * Return: 0 when the request was applied, -EINVAL when @type was rejected.
 */
int sched_set_boost(int type)
{
	int ret = 0;

	mutex_lock(&boost_mutex);
	if (verify_boost_params(type))
		_sched_set_boost(type);
	else
		ret = -EINVAL;
	mutex_unlock(&boost_mutex);

	return ret;
}
接下来,我们看关键的设置函数_sched_set_boost:
/*
 * _sched_set_boost() - apply one boost request and refresh the derived state.
 * @type: 0 drops every boost, a positive value enables that boost type, and
 *        a negative value disables the boost type -@type.
 *
 * Both callers visible in this file invoke it with boost_mutex held.
 */
static void _sched_set_boost(int type)
{
	/* The sign of @type selects between disable-all, enable and disable. */
	if (type == 0)
		sched_boost_disable_all();	/* (1) disable all boosts */
	else if (type > 0)
		sched_boost_enable(type);	/* (2) enable boost */
	else
		sched_boost_disable(-type);	/* (3) disable boost */

	/*
	 * sysctl_sched_boost holds the boost request from
	 * user space which could be different from the
	 * effectively enabled boost. Update the effective
	 * boost here.
	 */
	sched_boost_type = sched_effective_boost();
	sysctl_sched_boost = sched_boost_type;
	set_boost_policy(sysctl_sched_boost);	/* (4) set the boost policy */
	trace_sched_set_boost(sysctl_sched_boost);
}
首先看一下sched_boost的4个用于控制配置的结构体:
其中refcount来记录设置的次数。enter函数表示切换到该boost配置的动作;exit则是退出该boost配置的动作。
static struct sched_boost_data sched_boosts【】 = {
【NO_BOOST】 = {
.refcount = 0,
.enter = sched_no_boost_nop,
.exit = sched_no_boost_nop,
},
【FULL_THROTTLE_BOOST】 = {
.refcount = 0,
.enter = sched_full_throttle_boost_enter,
.exit = sched_full_throttle_boost_exit,
},
【CONSERVATIVE_BOOST】 = {
.refcount = 0,
.enter = sched_conservative_boost_enter,
.exit = sched_conservative_boost_exit,
},
【RESTRAINED_BOOST】 = {
.refcount = 0,
.enter = sched_restrained_boost_enter,
.exit = sched_restrained_boost_exit,
},
};
(1)disable all boost
调用除no boost外,所有boost配置的exit函数并且将他们的refcount清0。
/* Half-open range [START, END) of the real boost types (NO_BOOST excluded). */
#define SCHED_BOOST_START FULL_THROTTLE_BOOST
#define SCHED_BOOST_END (RESTRAINED_BOOST + 1)
/*
 * sched_boost_disable_all() - drop every outstanding boost request.
 *
 * For each boost type in [SCHED_BOOST_START, SCHED_BOOST_END) that still has
 * references, run its exit() hook and reset its refcount to zero.
 */
static void sched_boost_disable_all(void)
{
	int i;

	for (i = SCHED_BOOST_START; i < SCHED_BOOST_END; i++) {
		if (sched_boosts[i].refcount > 0) {
			sched_boosts[i].exit();
			sched_boosts[i].refcount = 0;
		}
	}
}
(2) enable boost
refcount记录调用次数+1;
由于sched_boost支持多应用同时调用,所以在设置boost之前,要先检查当前有效的boost配置。
优先级是No boost > Full Throttle > Conservative > Restrained。
static void sched_boost_enable(int type)
{
struct sched_boost_data sb = sched_boosts【type】;
int next_boost, prev_boost = sched_boost_type;
sb->refcount++; //refcount记录次数+1
if (sb->refcount != 1)
return;
/
This boost enable request did not come before.
Take this new request and find the next boost
by aggregating all the enabled boosts. If there
is a change, disable the previous boost and enable
the next boost.
/
next_boost = sched_effective_boost(); //设置boost之前,检查当前有效的boost配置
if (next_boost == prev_boost)
return;
sched_boosts【prev_boost】.exit(); //调用之前配置的exit,退出之前的boost
sched_boosts【next_boost】.enter(); //调用现在配置的enter,进入当前boost状态
通过检查refcount,来确认当前有效的boost。
static int sched_effective_boost(void)
{
int i;
/
The boosts are sorted in descending order by
priority.
/
for (i = SCHED_BOOST_START; i < SCHED_BOOST_END; i++) {
if (sched_boosts【i】.refcount >= 1)
return i;
}
return NO_BOOST;
}
(3)disable boost
同样假如是disable boost的话,就会相应的对refcount--,并且调用当前boost类型的exit函数来退出boost。
因为sched_boost支持多种boost同时开启,并按优先级设置。所以当disable一种boost时,最后通过检查当前有效的boost来进入余下优先级高的boost模式。
static void sched_boost_disable(int type)
{
struct sched_boost_data sb = sched_boosts【type】;
int next_boost;
if (sb->refcount <= 0)
return;
sb->refcount--;
if (sb->refcount)
return;
/
This boost's refcount becomes zero, so it must
be disabled. Disable it first and then apply
the next boost.
/
sb->exit();
next_boost = sched_effective_boost();
sched_boosts【next_boost】.enter();
}
(4)设置boost policy
在最后一步中,设置policy来体现task是否需要进行up migrate。
如下是sched_boost不同等级对应的up migrate迁移策略。
Full throttle和Conservative:SCHED_BOOST_ON_BIG---在进行task placement时,仅考虑capacity最大的cpu core
无:SCHED_BOOST_ON_ALL---在进行task placement时,仅不考虑capacity最小的cpu core
No Boost和Restrained:SCHED_BOOST_NONE---正常EAS
/*
 * Scheduler boost type and boost policy might at first seem unrelated,
 * however, there exists a connection between them that will allow us
 * to use them interchangeably during placement decisions. We'll explain
 * the connection here in one possible way so that the implications are
 * clear when looking at placement policies.
 *
 * When policy = SCHED_BOOST_NONE, type is either none or RESTRAINED
 * When policy = SCHED_BOOST_ON_ALL or SCHED_BOOST_ON_BIG, type can
 * neither be none nor RESTRAINED.
 */
/*
 * set_boost_policy() - derive the global boost_policy from a boost type.
 * @type: the effective boost type.
 *
 * Selection order: NO_BOOST/RESTRAINED_BOOST map to SCHED_BOOST_NONE; else a
 * non-zero boost_policy_dt is used as-is; else SCHED_BOOST_ON_BIG when the
 * minimum and maximum possible efficiency differ; else SCHED_BOOST_ON_ALL.
 */
static void set_boost_policy(int type)
{
	if (type == NO_BOOST || type == RESTRAINED_BOOST)
		/* Only the conservative and full-throttle types get a boost policy. */
		boost_policy = SCHED_BOOST_NONE;
	else if (boost_policy_dt)
		boost_policy = boost_policy_dt;
	else if (min_possible_efficiency != max_possible_efficiency)
		/* Presumably always true on an asymmetric (big.LITTLE) system. */
		boost_policy = SCHED_BOOST_ON_BIG;
	else
		boost_policy = SCHED_BOOST_ON_ALL;
}
接下来详细分析3种boost设置的原理:
Full Throttle
full throttle(全速)模式下的sched boost,主要有如下2个动作:
(1)core control
(2)freq aggregation
/*
 * Enter hook registered for FULL_THROTTLE_BOOST in sched_boosts[]: requests
 * a core-control boost and then turns on WALT frequency aggregation.
 */
static void sched_full_throttle_boost_enter(void)
{
core_ctl_set_boost(true); //(1) core control: takes a boost reference on every cluster
walt_enable_frequency_aggregation(true); //(2) freq aggregation (helper is not shown in this file)
}
(1)core control:isolate/unisolate cpu cores;enable boost时,开所有cpu core
/*
 * core_ctl_set_boost() - take or drop a core-control boost reference on
 * every cluster.
 * @boost: true to add a boost reference, false to remove one.
 *
 * The per-cluster boost counters are updated under state_lock. If the
 * update flipped the boost state, apply_need() is then run for each cluster
 * with the lock released.
 *
 * Return: 0 on success or when core control is not initialized; -EINVAL
 * when asked to drop a reference on a cluster whose counter is already zero
 * (the walk stops at that cluster).
 */
int core_ctl_set_boost(bool boost)
{
	unsigned int index = 0;
	struct cluster_data *cluster;
	unsigned long flags;
	int ret = 0;
	bool boost_state_changed = false;

	if (unlikely(!initialized))
		return 0;

	spin_lock_irqsave(&state_lock, flags);
	/*
	 * Update and record each cluster's boost count. boost_state_changed
	 * is overwritten on every iteration, so it reflects the last cluster
	 * visited.
	 */
	for_each_cluster(cluster, index) {
		if (boost) {
			boost_state_changed = !cluster->boost;
			++cluster->boost;
		} else {
			if (!cluster->boost) {
				ret = -EINVAL;
				break;
			} else {
				--cluster->boost;
				boost_state_changed = !cluster->boost;
			}
		}
	}
	spin_unlock_irqrestore(&state_lock, flags);

	if (boost_state_changed) {
		index = 0;
		/* Re-evaluate every cluster against the new boost state. */
		for_each_cluster(cluster, index)
			apply_need(cluster);
	}

	/*
	 * NOTE(review): 'cluster' is whatever for_each_cluster() left behind;
	 * confirm it still refers to a valid cluster at this point.
	 */
	trace_core_ctl_set_boost(cluster->boost, ret);

	return ret;
}
EXPORT_SYMBOL(core_ctl_set_boost);
/*
 * apply_need() - re-evaluate a cluster and kick its core-control thread.
 * @cluster: cluster to evaluate.
 *
 * Wakes the cluster's core control thread only when eval_need() returns
 * true for it.
 */
static void apply_need(struct cluster_data *cluster)
{
	if (eval_need(cluster))
		wake_up_core_ctl_thread(cluster);
}
具体如何判断的:
enable boost时:判断是否需要unisolate cpu,
disable boost时:判断need_cpus < active_cpus是否成立。
并且与上一次更新的间隔时间满足 > delay time。
static bool eval_need(struct cluster_data cluster)
{
unsigned long flags;
struct cpu_data c;
unsigned int need_cpus = 0, last_need, thres_idx;
int ret = 0;
bool need_flag = false;
unsigned int new_need;
s64 now, elapsed;
if (unlikely(!cluster->inited))
return 0;
spin_lock_irqsave(state_lock, flags);
if (cluster->boost || !cluster->enable) {
need_cpus = cluster->max_cpus; //当enable boost时,设置need_cpus为所有cpu
} else {
cluster->active_cpus = get_active_cpu_count(cluster); //当disable boost时,首先获取active的cpu
thres_idx = cluster->active_cpus ? cluster->active_cpus //代码效果参考:http://www.zidongmutanji.com/zsjx/396195.html
- 1 : 0;list_for_each_entry(c, cluster->lru, sib) {
bool old_is_busy = c->is_busy;
if (c->busy >= cluster->busy_up_thres【thres_idx】 ||
sched_cpu_high_irqload(c->cpu))
c->is_busy = true;
else if (c->busy busy_down_thres【thres_idx】)
c->is_busy = false;
trace_core_ctl_set_busy(c->cpu, c->busy, old_is_busy,
c->is_busy);
need_cpus += c->is_busy;
}
need_cpus = apply_task_need(cluster, need_cpus); //根据task需要,计算need_cpus
}
new_need = apply_limits(cluster, need_cpus); //限制need_cpus范围:cluster->min_cpus <= need_cpus max_cpus
need_flag = adjustment_possible(cluster, new_need); //()enable boost时:判断是否需要unisolate cpu; disable boost时:判断need_cpus < active_cpus是否成立
last_need = cluster->need_cpus;
now = ktime_to_ms(ktime_get());
if (new_need > cluster->active_cpus) {
ret = 1; //enable boost
} else {
/
When there is no change in need and there are no more
active CPUs than currently needed, just update the
need time stamp and return. //当需要的cpu没有变化时,只需要更新时间戳,然后return
*/
if (new_need == last_need new_need == cluster->active_cpus) {
cluster->need_ts = now;
spin_unlock_irqrestore(