latency-nice优先级补丁源码分析

1. 概述

2022年3月11日，Vincent Guittot写道：基于2020年Parth Shah提供的[1-4]系列latency nice相关的patches，他重启了相关工作，提供了一组新的有关latency nice priority的patches。这组patches要解决的问题是：在CFS调度器中，一个进程是否可以抢占当前正在运行的进程。

2. 设计思想

latency-nice优先级其实和nice值类似，但latency-nice用于表示进程对延迟（latency）的容忍程度。比如，latency-nice的取值范围为[-20,19]，其中**latency_nice=-20的进程A相比latency_nice=+19的进程B而言，进程A对延迟更敏感，需要更低的延迟才能维持正常运行，因此调度器需要优先考虑运行进程A**。

用户态通过latency_nice把进程的延迟需求传递给内核，内核再据此作用于CFS调度类（SCHED_CLASS），达到控制延迟的目的。

3. 代码实现

[PATCH 1/6]

主要作用：

struct task_struct结构体增加变量 int latency_nice；
/proc/<pid>/sched文件中显示该进程的 latency_nice；
定义latency_nice取值范围[-20,19]；
定义 DEFAULT_LATENCY_NICE 值为0；

Signed-off-by: Parth Shah <parth@linux.ibm.com>
[rebase]
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---include/linux/sched.h |  1 +kernel/sched/debug.c  |  1 +kernel/sched/sched.h  | 18 ++++++++++++++++++3 files changed, 20 insertions(+)diff --git a/include/linux/sched.h b/include/linux/sched.h
index 508b91d57470..2aa889a59054 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -779,6 +779,7 @@ struct task_struct {int                static_prio;int             normal_prio;unsigned int            rt_priority;
+  int             latency_nice;struct sched_entity        se;struct sched_rt_entity       rt;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 102d6f70e84d..5d76a8927888 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1043,6 +1043,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,#endifP(policy);P(prio);
+  P(latency_nice);if (task_has_dl_policy(p)) {P(dl.runtime);P(dl.deadline);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9b33ba9c3c42..456ad2159eb1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -105,6 +105,24 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);*/#define NS_TO_JIFFIES(TIME)    ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))+/*
+ * Latency nice is meant to provide scheduler hints about the relative
+ * latency requirements of a task with respect to other tasks.
+ * Thus a task with latency_nice == 19 can be hinted as the task with no
+ * latency requirements, in contrast to the task with latency_nice == -20
+ * which should be given priority in terms of lower latency.
+ */
+#define MAX_LATENCY_NICE  19
+#define MIN_LATENCY_NICE  -20
+
+#define LATENCY_NICE_WIDTH    \
+  (MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1)
+
+/*
+ * Default tasks should be treated as a task with latency_nice = 0.
+ */
+#define DEFAULT_LATENCY_NICE  0
+/** Increase resolution of nice-level calculations for 64-bit architectures.* The extra resolution improves shares distribution and load balancing of
--

[PATCH 2/6]

主要作用：

0号进程(也叫idle进程/swapper进程)的初始化 latency_nice = 0；
子进程的 latency_nice 继承父进程的 latency_nice值；
子进程如果设置了sched_reset_on_fork，则设置其 latency_nice = 0；

Signed-off-by: Parth Shah <parth@linux.ibm.com>
[rebase]
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---init/init_task.c    | 1 +kernel/sched/core.c | 4 ++++2 files changed, 5 insertions(+)diff --git a/init/init_task.c b/init/init_task.c
index 73cc8f03511a..2afa249c253b 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -78,6 +78,7 @@ struct task_struct init_task.prio        = MAX_PRIO - 20,.static_prio   = MAX_PRIO - 20,.normal_prio   = MAX_PRIO - 20,
+  .latency_nice   = 0,.policy        = SCHED_NORMAL,.cpus_ptr   = &init_task.cpus_mask,.user_cpus_ptr  = NULL,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1d863d7f6ad7..157eef880d1d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4393,6 +4393,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)*/p->prio = current->normal_prio;+   /* Propagate the parent's latency requirements to the child as well */
+  p->latency_nice = current->latency_nice;
+uclamp_fork(p);/*
@@ -4409,6 +4412,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)p->prio = p->normal_prio = p->static_prio;set_load_weight(p, false);+        p->latency_nice = DEFAULT_LATENCY_NICE;/** We don't need the reset flag anymore after the fork. It has* fulfil

[PATCH 3/6]

主要作用：

在sched_attr结构体中增加sched_latency_nice，并且支持通过sched_setattr/sched_getattr等系统调用来修改/获取；
新增flag标志SCHED_FLAG_LATENCY_NICE，每次系统调用修改sched_latency_nice后会在内核态修改相应的latency_nice/latency_prio；

Signed-off-by: Parth Shah <parth@linux.ibm.com>
[rebase and add a dedicated __setscheduler_latency ]
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---include/uapi/linux/sched.h       |  4 +++-include/uapi/linux/sched/types.h | 19 +++++++++++++++++++kernel/sched/core.c              | 26 ++++++++++++++++++++++++++tools/include/uapi/linux/sched.h |  4 +++-4 files changed, 51 insertions(+), 2 deletions(-)diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab2..b2e932c25be6 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {#define SCHED_FLAG_KEEP_PARAMS      0x10#define SCHED_FLAG_UTIL_CLAMP_MIN   0x20#define SCHED_FLAG_UTIL_CLAMP_MAX   0x40
+#define SCHED_FLAG_LATENCY_NICE       0x80#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {SCHED_FLAG_RECLAIM      | \SCHED_FLAG_DL_OVERRUN        | \SCHED_FLAG_KEEP_ALL      | \
-            SCHED_FLAG_UTIL_CLAMP)
+           SCHED_FLAG_UTIL_CLAMP      | \
+           SCHED_FLAG_LATENCY_NICE)#endif /* _UAPI_LINUX_SCHED_H */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index f2c4589d4dbf..0aa4e3b6ed59 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -10,6 +10,7 @@ struct sched_param {#define SCHED_ATTR_SIZE_VER0 48  /* sizeof first published struct */#define SCHED_ATTR_SIZE_VER1 56  /* add: util_{min,max} */
+#define SCHED_ATTR_SIZE_VER2  60  /* add: latency_nice *//** Extended scheduling parameters data structure.
@@ -98,6 +99,22 @@ struct sched_param {* scheduled on a CPU with no more capacity than the specified value.** A task utilization boundary can be reset by setting the attribute to -1.
+ *
+ * Latency Tolerance Attributes
+ * ===========================
+ *
+ * A subset of sched_attr attributes allows to specify the relative latency
+ * requirements of a task with respect to the other tasks running/queued in the
+ * system.
+ *
+ * @ sched_latency_nice  task's latency_nice value
+ *
+ * The latency_nice of a task can have any value in a range of
+ * [LATENCY_NICE_MIN..LATENCY_NICE_MAX].
+ *
+ * A task with latency_nice with the value of LATENCY_NICE_MIN can be
+ * taken for a task with lower latency requirements as opposed to the task with
+ * higher latency_nice.*/struct sched_attr {__u32 size;
@@ -120,6 +137,8 @@ struct sched_attr {__u32 sched_util_min;__u32 sched_util_max;+    /* latency requirement hints */
+  __s32 sched_latency_nice;};#endif /* _UAPI_LINUX_SCHED_TYPES_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 157eef880d1d..3edba1a38ecb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7219,6 +7219,16 @@ static void __setscheduler_params(struct task_struct *p,p->rt_priority = attr->sched_priority;p->normal_prio = normal_prio(p);set_load_weight(p, true);
+
+}
+
+static void __setscheduler_latency(struct task_struct *p,
+      const struct sched_attr *attr)
+{+  if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {+      p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice);
+      set_latency_weight(p);
+  }}/*
@@ -7345,6 +7355,13 @@ static int __sched_setscheduler(struct task_struct *p,return retval;}+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {+      if (attr->sched_latency_nice > MAX_LATENCY_NICE)
+          return -EINVAL;
+      if (attr->sched_latency_nice < MIN_LATENCY_NICE)
+          return -EINVAL;
+  }
+if (pi)cpuset_read_lock();@@ -7379,6 +7396,9 @@ static int __sched_setscheduler(struct task_struct *p,goto change;if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)goto change;
+      if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
+          attr->sched_latency_nice != p->latency_nice)
+          goto change;p->sched_reset_on_fork = reset_on_fork;retval = 0;
@@ -7467,6 +7487,7 @@ static int __sched_setscheduler(struct task_struct *p,__setscheduler_params(p, attr);__setscheduler_prio(p, newprio);}
+  __setscheduler_latency(p, attr);__setscheduler_uclamp(p, attr);if (queued) {@@ -7677,6 +7698,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr
*asize < SCHED_ATTR_SIZE_VER1)return -EINVAL;+  if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
+      size < SCHED_ATTR_SIZE_VER2)
+      return -EINVAL;/** XXX: Do we want to be lenient like existing syscalls; or do we want* to be strict and return an error on out-of-bounds values?
@@ -7914,6 +7938,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *,
uattr,get_params(p, &kattr);kattr.sched_flags &= SCHED_FLAG_ALL;+ kattr.sched_latency_nice = p->latency_nice;
+#ifdef CONFIG_UCLAMP_TASK/** This could race with another potential updater, but this is fine
diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h
index 3bac0a8ceab2..ecc4884bfe4b 100644
--- a/tools/include/uapi/linux/sched.h
+++ b/tools/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {#define SCHED_FLAG_KEEP_PARAMS      0x10#define SCHED_FLAG_UTIL_CLAMP_MIN   0x20#define SCHED_FLAG_UTIL_CLAMP_MAX   0x40
+#define SCHED_FLAG_LATENCY_NICE       0X80#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {SCHED_FLAG_RECLAIM      | \SCHED_FLAG_DL_OVERRUN        | \SCHED_FLAG_KEEP_ALL      | \
-            SCHED_FLAG_UTIL_CLAMP)
+           SCHED_FLAG_UTIL_CLAMP      | \
+           SCHED_FLAG_LATENCY_NICE)#endif /* _UAPI_LINUX_SCHED_H */
--

[PATCH 4/6]

主要作用：

类似于NICE值系统，使用CAP_SYS_NICE做安全检查：当不具备CAP_SYS_NICE权能的进程（通常是非root用户的进程）尝试减小某个进程的latency_nice时，将会返回-EPERM；

Signed-off-by: Parth Shah <parth@linux.ibm.com>
[rebase]
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---kernel/sched/core.c | 4 ++++1 file changed, 4 insertions(+)diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3edba1a38ecb..8f8b102a75c4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7360,6 +7360,10 @@ static int __sched_setscheduler(struct task_struct *p,return -EINVAL;if (attr->sched_latency_nice < MIN_LATENCY_NICE)return -EINVAL;
+      /* Use the same security checks as NICE */
+      if (attr->sched_latency_nice < p->latency_nice &&
+          !capable(CAP_SYS_NICE))
+          return -EPERM;}if (pi)
--

[PATCH 5/6]

主要作用：

将latency_nice转化为latency_prio，使用优先级来表示对延迟的敏感度，并且引入latency_weight;
拥有更大latency_weight权重（即对延迟更敏感）的进程，只要其时间片还没有用完，就可以在唤醒时抢占当前正在运行的进程；
latency_weight权重较低的进程在唤醒时只有为了保证公平性才会抢占当前进程，否则只能等tick到来才能获得时间片（slice）执行；

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---include/linux/sched.h |  4 ++-init/init_task.c      |  2 +-kernel/sched/core.c   | 32 +++++++++++++++++++----kernel/sched/debug.c  |  2 +-kernel/sched/fair.c   | 60 +++++++++++++++++++++++++++++++++++++++++--kernel/sched/sched.h  | 12 +++++++++6 files changed, 102 insertions(+), 10 deletions(-)diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2aa889a59054..9aeb157e819b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -560,6 +560,8 @@ struct sched_entity {unsigned long         runnable_weight;#endif+    int             latency_weight;
+#ifdef CONFIG_SMP/** Per entity load average tracking.
@@ -779,7 +781,7 @@ struct task_struct {int                static_prio;int             normal_prio;unsigned int            rt_priority;
-   int             latency_nice;
+  int             latency_prio;struct sched_entity        se;struct sched_rt_entity       rt;
diff --git a/init/init_task.c b/init/init_task.c
index 2afa249c253b..e98c71f24981 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -78,7 +78,7 @@ struct task_struct init_task.prio        = MAX_PRIO - 20,.static_prio   = MAX_PRIO - 20,.normal_prio   = MAX_PRIO - 20,
-   .latency_nice   = 0,
+  .latency_prio   = NICE_WIDTH - 20,.policy      = SCHED_NORMAL,.cpus_ptr   = &init_task.cpus_mask,.user_cpus_ptr  = NULL,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8f8b102a75c4..547b0da01efe 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1241,6 +1241,11 @@ static void set_load_weight(struct task_struct *p, bool update_load)}}+static void set_latency_weight(struct task_struct *p)
+{+  p->se.latency_weight = sched_latency_to_weight[p->latency_prio];
+}
+#ifdef CONFIG_UCLAMP_TASK/** Serializes updates of utilization clamp values
@@ -4394,7 +4399,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)p->prio = current->normal_prio;/* Propagate the parent's latency requirements to the child as well */
-   p->latency_nice = current->latency_nice;
+  p->latency_prio = current->latency_prio;uclamp_fork(p);@@ -4412,7 +4417,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)p->prio = p->normal_prio = p->static_prio;set_load_weight(p, false);-       p->latency_nice = DEFAULT_LATENCY_NICE;
+      p->latency_prio = NICE_TO_LATENCY(0);/** We don't need the reset flag anymore after the fork. It has* fulfilled its duty:
@@ -4420,6 +4425,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)p->sched_reset_on_fork = 0;}+   /* Once latency_prio is set, update the latency weight */
+  set_latency_weight(p);
+if (dl_prio(p->prio))return -EAGAIN;else if (rt_prio(p->prio))
@@ -7361,7 +7369,7 @@ static int __sched_setscheduler(struct task_struct *p,if (attr->sched_latency_nice < MIN_LATENCY_NICE)return -EINVAL;/* Use the same security checks as NICE */
-       if (attr->sched_latency_nice < p->latency_nice &&
+      if (attr->sched_latency_nice < LATENCY_TO_NICE(p->latency_prio) &&!capable(CAP_SYS_NICE))return -EPERM;}
@@ -7401,7 +7409,7 @@ static int __sched_setscheduler(struct task_struct *p,if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)goto change;if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
-           attr->sched_latency_nice != p->latency_nice)
+          attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio))goto change;p->sched_reset_on_fork = reset_on_fork;
@@ -7942,7 +7950,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *,
uattr,get_params(p, &kattr);kattr.sched_flags &= SCHED_FLAG_ALL;-  kattr.sched_latency_nice = p->latency_nice;
+  kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio);#ifdef CONFIG_UCLAMP_TASK/*
@@ -10954,6 +10962,20 @@ const u32 sched_prio_to_wmult[40] = {/*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,};+/*
+ * latency weight for wakeup preemption
+ */
+const int sched_latency_to_weight[40] = {+ /* -20 */      1024,       973,       922,       870,       819,
+ /* -15 */       768,       717,       666,       614,       563,
+ /* -10 */       512,       461,       410,       358,       307,
+ /*  -5 */       256,       205,       154,       102,       51,
+ /*   0 */       0,       -51,      -102,      -154,      -205,
+ /*   5 */      -256,      -307,      -358,      -410,      -461,
+ /*  10 */      -512,      -563,      -614,      -666,      -717,
+ /*  15 */      -768,      -819,      -870,      -922,      -973,
+};
+void call_trace_sched_update_nr_running(struct rq *rq, int count){trace_sched_update_nr_running_tp(rq, count);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5d76a8927888..253e52ec73fb 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1043,7 +1043,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,#endifP(policy);P(prio);
-   P(latency_nice);
+  P(latency_prio);if (task_has_dl_policy(p)) {P(dl.runtime);P(dl.deadline);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c4bfffe8c2c..506c482a0e48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5555,6 +5555,35 @@ static int sched_idle_cpu(int cpu)}#endif+static void set_next_buddy(struct sched_entity *se);
+
+static void check_preempt_from_idle(struct cfs_rq *cfs, struct sched_entity *se)
+{+  struct sched_entity *next;
+
+  if (se->latency_weight <= 0)
+      return;
+
+  if (cfs->nr_running <= 1)
+      return;
+  /*
+   * When waking from idle, we don't need to check to preempt at wakeup
+   * the idle thread and don't set next buddy as a candidate for being
+   * picked in priority.
+   * In case of simultaneous wakeup from idle, the latency sensitive tasks
+   * lost opportunity to preempt non sensitive tasks which woke up
+   * simultaneously.
+   */
+
+  if (cfs->next)
+      next = cfs->next;
+  else
+      next = __pick_first_entity(cfs);
+
+  if (next && wakeup_preempt_entity(next, se) == 1)
+      set_next_buddy(se);
+}
+/** The enqueue_task method is called before nr_running is* increased. Here we update the fair scheduling stats and
@@ -5648,6 +5677,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)if (!task_new)update_overutilized_status(rq);+    if (rq->curr == rq->idle)
+      check_preempt_from_idle(cfs_rq_of(&p->se), &p->se);
+enqueue_throttle:if (cfs_bandwidth_used()) {/*
@@ -5669,8 +5701,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)hrtick_update(rq);}-static void set_next_buddy(struct sched_entity *se);
-/** The dequeue_task method is called before nr_running is* decreased. We remove the task from the rbtree and
@@ -6970,6 +7000,27 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)}#endif /* CONFIG_SMP */+static long wakeup_latency_gran(int latency_weight)
+{+  long thresh = sysctl_sched_latency;
+
+  if (!latency_weight)
+      return 0;
+
+  if (sched_feat(GENTLE_FAIR_SLEEPERS))
+      thresh >>= 1;
+
+  /*
+   * Clamp the delta to stay in the scheduler period range
+   * [-sysctl_sched_latency:sysctl_sched_latency]
+   */
+  latency_weight = clamp_t(long, latency_weight,
+              -1 * NICE_LATENCY_WEIGHT_MAX,
+              NICE_LATENCY_WEIGHT_MAX);
+
+  return (thresh * latency_weight) >> NICE_LATENCY_SHIFT;
+}
+static unsigned long wakeup_gran(struct sched_entity *se){unsigned long gran = sysctl_sched_wakeup_granularity;
@@ -7008,6 +7059,10 @@ static intwakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se){s64 gran, vdiff = curr->vruntime - se->vruntime;
+  int latency_weight = se->latency_weight - curr->latency_weight;
+
+  latency_weight = min(latency_weight, se->latency_weight);
+  vdiff += wakeup_latency_gran(latency_weight);if (vdiff <= 0)return -1;
@@ -7117,6 +7172,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int
wake_return;update_curr(cfs_rq_of(se));
+if (wakeup_preempt_entity(se, pse) == 1) {/** Bias pick_next to pick the sched entity that is
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 456ad2159eb1..dd92aa9c36f9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -122,6 +122,17 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);* Default tasks should be treated as a task with latency_nice = 0.*/#define DEFAULT_LATENCY_NICE    0
+#define DEFAULT_LATENCY_PRIO  (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2)
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static latency [ 0..39 ],
+ * and back.
+ */
+#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO)
+#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO)
+#define NICE_LATENCY_SHIFT    (SCHED_FIXEDPOINT_SHIFT)
+#define NICE_LATENCY_WEIGHT_MAX   (1L << NICE_LATENCY_SHIFT)/** Increase resolution of nice-level calculations for 64-bit architectures.
@@ -2098,6 +2109,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE);extern const int     sched_prio_to_weight[40];extern const u32       sched_prio_to_wmult[40];
+extern const int      sched_latency_to_weight[40];/** {de,en}queue flags:
--

[PATCH 6/6]

主要作用：

组调度中，进程可以设置自己的latency_prio优先级来抢占当前进程，但其所在task group的调度实体仍然使用默认的latency优先级；该补丁在task group中增加一个latency_prio字段（可通过cgroup的"latency"和"latency.nice"文件读写），用于设置组的latency优先级。

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---kernel/sched/core.c  | 41 +++++++++++++++++++++++++++++++++++++++++kernel/sched/fair.c  | 32 ++++++++++++++++++++++++++++++++kernel/sched/sched.h |  4 ++++3 files changed, 77 insertions(+)diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 547b0da01efe..e0668652dd24 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10635,6 +10635,30 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,{return sched_group_set_idle(css_tg(css), idle);}
+
+static s64 cpu_latency_read_s64(struct cgroup_subsys_state *css,
+                 struct cftype *cft)
+{+  return css_tg(css)->latency_prio;
+}
+
+static int cpu_latency_write_s64(struct cgroup_subsys_state *css,
+              struct cftype *cft, s64 latency_prio)
+{+  return sched_group_set_latency(css_tg(css), latency_prio);
+}
+
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
+                 struct cftype *cft)
+{+  return LATENCY_TO_NICE(css_tg(css)->latency_prio);
+}
+
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
+              struct cftype *cft, s64 latency_nice)
+{+  return sched_group_set_latency(css_tg(css), NICE_TO_LATENCY(latency_nice));
+}#endifstatic struct cftype cpu_legacy_files[] = {@@ -10649,6 +10673,11 @@ static struct cftype cpu_legacy_files[] = {.read_s64 = cpu_idle_read_s64,.write_s64 = cpu_idle_write_s64,},
+  {+      .name = "latency",
+      .read_s64 = cpu_latency_read_s64,
+      .write_s64 = cpu_latency_write_s64,
+  },#endif#ifdef CONFIG_CFS_BANDWIDTH{@@ -10866,6 +10895,18 @@ static struct cftype cpu_files[] = {.read_s64 = cpu_idle_read_s64,.write_s64 = cpu_idle_write_s64,},
+  {+      .name = "latency",
+      .flags = CFTYPE_NOT_ON_ROOT,
+      .read_s64 = cpu_latency_read_s64,
+      .write_s64 = cpu_latency_write_s64,
+  },
+  {+      .name = "latency.nice",
+      .flags = CFTYPE_NOT_ON_ROOT,
+      .read_s64 = cpu_latency_nice_read_s64,
+      .write_s64 = cpu_latency_nice_write_s64,
+  },#endif#ifdef CONFIG_CFS_BANDWIDTH{diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 506c482a0e48..cbccef025089 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11496,6 +11496,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group
*parent)goto err;tg->shares = NICE_0_LOAD;
+  tg->latency_prio = DEFAULT_LATENCY_PRIO;init_cfs_bandwidth(tg_cfs_bandwidth(tg));@@ -11594,6 +11595,7 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,}se->my_q = cfs_rq;
+  se->latency_weight = sched_latency_to_weight[tg->latency_prio];/* guarantee group entities always have weight */update_load_set(&se->load, NICE_0_LOAD);se->parent = parent;
@@ -11724,6 +11726,36 @@ int sched_group_set_idle(struct task_group *tg, long idle)return 0;}+int sched_group_set_latency(struct task_group *tg, long latency_prio)
+{+  int i;
+
+  if (tg == &root_task_group)
+      return -EINVAL;
+
+  if (latency_prio < 0 ||
+      latency_prio > LATENCY_NICE_WIDTH)
+      return -EINVAL;
+
+  mutex_lock(&shares_mutex);
+
+  if (tg->latency_prio == latency_prio) {+      mutex_unlock(&shares_mutex);
+      return 0;
+  }
+
+  tg->latency_prio = latency_prio;
+
+  for_each_possible_cpu(i) {+      struct sched_entity *se = tg->se[i];
+
+      WRITE_ONCE(se->latency_weight, sched_latency_to_weight[latency_prio]);
+  }
+
+  mutex_unlock(&shares_mutex);
+  return 0;
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */void free_fair_sched_group(struct task_group *tg) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dd92aa9c36f9..885d1c809329 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -429,6 +429,8 @@ struct task_group {/* A positive value indicates that this is a SCHED_IDLE group. */int            idle;
+  /* latency priority of the group. */
+  int         latency_prio;#ifdef CONFIG_SMP/*
@@ -542,6 +544,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long
shares);extern int sched_group_set_idle(struct task_group *tg, long idle);+extern int sched_group_set_latency(struct task_group *tg, long latency);
+#ifdef CONFIG_SMPextern void set_task_rq_fair(struct sched_entity *se,struct cfs_rq *prev, struct cfs_rq *next);
--

4. 总结

latency-nice值其实和现有的nice值系统十分类似，个人认为latency-nice是对Linux实时性较差的一个补充解决方案。而且该方案是一个十分有价值的参考，设想一下：你的定制系统如果需要对某类业务的实时响应十分严格，就可以参考latency-nice这种方法定制你想要的优先级系统。