进程创建与CFS调度-CSDN博客

本文详细解析了进程创建过程中，如何通过CFS调度器初始化并加入运行队列。从do_fork函数出发，深入探讨sched_fork和task_fork_fair函数的作用，以及进程状态的设置和调度策略的调整。并通过wake_up_new_task函数，展示了新进程如何被唤醒并最终加入到调度器管理的流程。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

进程创建

在进程创建时，调用do_fork函数来创建新进程，那么和调度相关的操作主要有两个，一个是sched_fork，这是对一个进程进行调度的初始化，另外一个就是wake_up_new_task，这个是把刚刚创建的子进程唤醒加入到调度器中管理。
首先来看sched_fork函数，调用流为do_fork-->copy_process-->sched_fork。

 /*
  * fork()/clone()-time setup:
  *
  * Performs the scheduler-side initialization of a freshly forked task @p:
  * sets its state and priority, optionally resets its policy, picks its
  * scheduling class, runs that class's task_fork hook and records the CPU.
  *
  * @clone_flags is only forwarded to __sched_fork() in this function.
  *
  * Returns 0 on success, or -EAGAIN when the child's resulting priority is
  * a deadline priority (dl_prio()).
  */
 int sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
     unsigned long flags;
     int cpu = get_cpu();    // paired with put_cpu() on every return path below
 
     __sched_fork(clone_flags, p);   // initialize the scheduling-related fields of p
     /*
      * We mark the process as running here. This guarantees that
      * nobody will actually run it, and a signal or other external
      * event cannot wake it up and insert it on the runqueue either.
      */
     p->state = TASK_RUNNING;       // mark the child as TASK_RUNNING
 
     /*
      * Make sure we do not leak PI boosting priority to the child.
      */
     p->prio = current->normal_prio;  // start from the parent's normal (un-boosted) priority
 
     /*
      * Revert to default priority/policy on fork if requested.
      */
     if (unlikely(p->sched_reset_on_fork)) {
         if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
             /* deadline/RT tasks fall back to SCHED_NORMAL at nice 0 */
             p->policy = SCHED_NORMAL;
             p->static_prio = NICE_TO_PRIO(0);
             p->rt_priority = 0;
         } else if (PRIO_TO_NICE(p->static_prio) < 0)
             /* a negative nice value is reset to nice 0 */
             p->static_prio = NICE_TO_PRIO(0);
 
         p->prio = p->normal_prio = __normal_prio(p);
         set_load_weight(p);
 
         /*
          * We don't need the reset flag anymore after the fork. It has
          * fulfilled its duty:
          */
         p->sched_reset_on_fork = 0;     // the block above handles the optional policy reset
     }
 
     /* Choose the scheduling class from the (possibly reset) priority. */
     if (dl_prio(p->prio)) {
         put_cpu();
         return -EAGAIN;
     } else if (rt_prio(p->prio)) {
         p->sched_class = &rt_sched_class;
     } else {
         p->sched_class = &fair_sched_class;  // ordinary tasks use the fair (CFS) class
     }
 
     if (p->sched_class->task_fork)
         p->sched_class->task_fork(p);  // run the class's task_fork callback, if it has one
 
     /*
      * The child is not yet in the pid-hash so no cgroup attach races,
      * and the cgroup is pinned to this child due to cgroup_fork()
      * is ran before sched_fork().
      *
      * Silence PROVE_RCU.
      */
     raw_spin_lock_irqsave(&p->pi_lock, flags);
     set_task_cpu(p, cpu);            // record the CPU obtained from get_cpu() as the child's CPU
     raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
     /* Clear p->sched_info when sched_info_on() reports it as enabled. */
     if (likely(sched_info_on()))
         memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
 #if defined(CONFIG_SMP)
     p->on_cpu = 0;
 #endif
     init_task_preempt_count(p);    // initialize the child's preempt count
 #ifdef CONFIG_SMP
     plist_node_init(&p->pushable_tasks, MAX_PRIO);
     RB_CLEAR_NODE(&p->pushable_dl_tasks);
 #endif
 
     put_cpu();
     return 0;
 }

下面来看CFS调度器中task_fork的实现，调用流do_fork-->copy_process-->sched_fork-->task_fork_fair：

/*
 * task_fork_fair - fork-time hook for a new task @p in the fair class.
 *
 * Reached from sched_fork() through p->sched_class->task_fork when the
 * class is fair_sched_class (the wiring itself is not shown here).
 *
 * Takes this CPU's rq->lock (irqsave) around the whole body. It seeds the
 * child's vruntime from the cfs_rq's current entity, lets place_entity()
 * adjust it for an initial placement, optionally swaps vruntime with the
 * current entity when sysctl_sched_child_runs_first is set, and finally
 * stores the child's vruntime relative to cfs_rq->min_vruntime.
 */
static void task_fork_fair(struct task_struct *p)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se, *curr;
    int this_cpu = smp_processor_id();
    struct rq *rq = this_rq();
    unsigned long flags;

    raw_spin_lock_irqsave(&rq->lock, flags);

    update_rq_clock(rq);

    /* cfs_rq of the forking task and the entity currently running on it. */
    cfs_rq = task_cfs_rq(current);
    curr = cfs_rq->curr;

    /*
     * Not only the cpu but also the task_group of the parent might have
     * been changed after parent->se.parent,cfs_rq were copied to
     * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
     * of child point to valid ones.
     */
    rcu_read_lock();
    __set_task_cpu(p, this_cpu);
    rcu_read_unlock();

    /* Bring the running entity's accounting up to date before copying it. */
    update_curr(cfs_rq);

    if (curr)
        se->vruntime = curr->vruntime;
    /* Third argument 1 requests the "initial" placement of a new entity. */
    place_entity(cfs_rq, se, 1);

    if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
        /*
         * Upon rescheduling, sched_class::put_prev_task() will place
         * 'current' within the tree based on its new key value.
         */
        swap(curr->vruntime, se->vruntime);
        resched_curr(rq);
    }

    /*
     * Store vruntime relative to this cfs_rq's min_vruntime;
     * enqueue_entity() adds the destination cfs_rq's min_vruntime back
     * when the entity is enqueued without ENQUEUE_WAKEUP.
     */
    se->vruntime -= cfs_rq->min_vruntime;

    raw_spin_unlock_irqrestore(&rq->lock, flags);
}

这个函数主要实现的是如下几个步骤：
（1）更新runqueue clock
（2）将子进程的cpu设置为当前CPU（即父进程正在运行的CPU）
（3）update_curr是CFS调度器中核心函数，更新父进程的sum_exec_runtime，vruntime和runqueue的min_vruntime
（4）place_entity对于新创建的进程进行惩罚，vruntime会加上一个值，防止新创建进程恶意占有CPU

加入运行队列（enqueue操作）

上面介绍的就是进程创建关于调度的初始化过程，那么初始化完成后，下面就要把新的子进程加入到调度器中，涉及的函数如下do_fork-->wake_up_new_task：

 /*
  * wake_up_new_task - place a newly created task @p on a run queue for the
  * first time.
  *
  * Holds p->pi_lock (irqsave) for the whole function and additionally the
  * task's rq lock from __task_rq_lock() onwards; task_rq_unlock() at the end
  * is passed both the rq and the saved flags.
  */
 void wake_up_new_task(struct task_struct *p)
 {
     unsigned long flags;
     struct rq *rq;
 
     raw_spin_lock_irqsave(&p->pi_lock, flags);
 #ifdef CONFIG_SMP
     /*
      * Fork balancing, do it here and not earlier because:
      *  - cpus_allowed can change in the fork path
      *  - any previously selected cpu might disappear through hotplug
      */
     set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));   // re-select the child's CPU/run queue: effectively one round of load balancing
 #endif
 
     /* Initialize new task's runnable average */
     init_task_runnable_average(p);                   // per the article: seeds the child's slice and load contribution from its weight (helper body not shown)
     rq = __task_rq_lock(p);
     activate_task(rq, p, 0);                         // enqueue the child on the run queue (flags == 0); the core step of this function
     p->on_rq = TASK_ON_RQ_QUEUED;
     trace_sched_wakeup_new(p, true);
     check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
     /* Optional per-class hook, called only if the class provides it. */
     if (p->sched_class->task_woken)
         p->sched_class->task_woken(rq, p);
 #endif
     task_rq_unlock(rq, p, &flags);
 }

/*
 * activate_task - hand task @p to run queue @rq.
 *
 * When task_contributes_to_load(p) is true, rq->nr_uninterruptible is
 * decremented first. The task is then passed to enqueue_task() with the
 * caller's @flags unchanged.
 */
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
    if (task_contributes_to_load(p)) {
        rq->nr_uninterruptible -= 1;
    }

    enqueue_task(rq, p, flags);
}

static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
    update_rq_clock(rq);
    sched_info_queued(rq, p);
    p->sched_class->enqueue_task(rq, p, flags);
}   

/*
 * enqueue_task_fair - enqueue_task hook of the fair scheduling class.
 *
 * Starting from the task's own sched_entity, the first loop enqueues each
 * entity visited by for_each_sched_entity() until it meets one that is
 * already on a run queue or a throttled cfs_rq. The second loop continues
 * from wherever the first one stopped (the cursor 'se' is not reset) and
 * only updates counters and load for the remaining entities.
 *
 * The rq-level running count is bumped only when 'se' ended up NULL, i.e.
 * neither loop was cut short by a throttled cfs_rq.
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se;

    for_each_sched_entity(se) {
        if (se->on_rq)
            break;
        cfs_rq = cfs_rq_of(se);
        enqueue_entity(cfs_rq, se, flags);  // add this scheduling entity to its cfs_rq

        /*
         * end evaluation on encountering a throttled cfs_rq
         *
         * note: in the case of encountering a throttled cfs_rq we will
         * post the final h_nr_running increment below.
        */
        if (cfs_rq_throttled(cfs_rq))
            break;
        cfs_rq->h_nr_running++;

        /* Every entity after the first is enqueued with ENQUEUE_WAKEUP. */
        flags = ENQUEUE_WAKEUP;
    }

    for_each_sched_entity(se) {
        cfs_rq = cfs_rq_of(se);
        cfs_rq->h_nr_running++;

        if (cfs_rq_throttled(cfs_rq))
            break;

        update_cfs_shares(cfs_rq);      // refresh the cfs shares
        update_entity_load_avg(se, 1);  // refresh entity/run-queue load; enqueue_entity() likewise calls update_cfs_shares() and a load-average update for the entities it enqueues
    }

    if (!se) {
        update_rq_runnable_avg(rq, rq->nr_running);
        add_nr_running(rq, 1);
    }
    hrtick_update(rq);
}

/*
 * enqueue_entity - add scheduling entity @se to @cfs_rq.
 *
 * @flags: ENQUEUE_* bits. Without ENQUEUE_WAKEUP (or with ENQUEUE_WAKING)
 * the entity's vruntime is treated as relative and cfs_rq->min_vruntime is
 * added back first. With ENQUEUE_WAKEUP the entity additionally goes
 * through place_entity(..., 0) and enqueue_sleeper().
 *
 * The entity is inserted via __enqueue_entity() unless it is cfs_rq->curr,
 * and se->on_rq is set to 1 in either case.
 */
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    /*
     * Update the normalized vruntime before updating min_vruntime
     * through calling update_curr().
     */
    if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
        se->vruntime += cfs_rq->min_vruntime;  // a newly forked task takes this path: min_vruntime is added back

    /*
     * Update run-time statistics of the 'current'.
     */
    update_curr(cfs_rq);  // refresh timing and vruntime of the entity currently running on this cfs_rq
    enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);  // include the entity in this cfs_rq's load tracking
    account_entity_enqueue(cfs_rq, se);
    update_cfs_shares(cfs_rq);

    if (flags & ENQUEUE_WAKEUP) {
        place_entity(cfs_rq, se, 0); // per the article: compensates a just-woken entity by lowering its vruntime so it runs sooner (place_entity() body not shown)
        enqueue_sleeper(cfs_rq, se);
    }

    update_stats_enqueue(cfs_rq, se);
    check_spread(cfs_rq, se);
    if (se != cfs_rq->curr)
        __enqueue_entity(cfs_rq, se); // insert into the run queue's rb-tree; skipped for the currently running entity
    se->on_rq = 1;

    /* Extra bookkeeping when this is the only runnable entity on the cfs_rq. */
    if (cfs_rq->nr_running == 1) {
        list_add_leaf_cfs_rq(cfs_rq);
        check_enqueue_throttle(cfs_rq);
    }
}

经历了以上这些操作以后，一个进程的创建到加入runqueue的过程就完成了，后续该进程就开始接受CFS调度器的调度了。