Kernel runtime state
The kernel initialization flow has now been analyzed. How does the kernel stay in its runtime state (i.e. never return)? Next we look at the function skipped earlier, arch_call_rest_init:
/* Do the rest non-__init'ed, we're now alive */
arch_call_rest_init();
Entering the arch_call_rest_init function, whose body is simply:
rest_init();
Inside rest_init():
struct task_struct *tsk;
int pid;
rcu_scheduler_starting();
rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
/*
#define RCU_SCHEDULER_INACTIVE 0
#define RCU_SCHEDULER_INIT 1
#define RCU_SCHEDULER_RUNNING 2
*/
rcu_scheduler_active is set to the running state (RCU_SCHEDULER_RUNNING), telling RCU that the scheduler is now active.
/*
* We need to spawn init first so that it obtains pid 1, however
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
pid = kernel_thread(kernel_init, NULL, CLONE_FS);
/* kernel_init ---- completes the deferred initialization (mm, rt/dl scheduler pieces and other setup done in kernel_init_freeable), opens /dev/console, waits for all asynchronous init code to finish before freeing init memory, then runs the "/init" process, falling back to /sbin/init, /etc/init, /bin/init and /bin/sh */
/* system_state = SYSTEM_RUNNING; */
Here kernel_init ultimately executes /init, /sbin/init, /etc/init, /bin/init or /bin/sh (on a systemd-based system, systemd --switched-root --system… ends up as the pid 1 process, and user-space processes descend from this pid 1 parent; see the systemd package analysis). The fallback order is sketched below.
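Abridged from the tail of kernel_init() in init/main.c (a v5.x kernel; error reporting and the CONFIG_DEFAULT_INIT case are trimmed), the selection looks roughly like this:
        if (ramdisk_execute_command) {
                ret = run_init_process(ramdisk_execute_command);        /* usually "/init" from the initramfs */
                if (!ret)
                        return 0;
        }
        if (execute_command) {                                          /* "init=" on the kernel command line */
                ret = run_init_process(execute_command);
                if (!ret)
                        return 0;
                panic("Requested init %s failed (error %d).", execute_command, ret);
        }
        if (!try_to_run_init_process("/sbin/init") ||
            !try_to_run_init_process("/etc/init") ||
            !try_to_run_init_process("/bin/init") ||
            !try_to_run_init_process("/bin/sh"))
                return 0;

        panic("No working init found.  Try passing init= option to kernel. "
              "See Linux Documentation/admin-guide/init.rst for guidance.");
Now, entering the kernel_thread function: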
/*
* Create a kernel thread.
*/
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
        struct kernel_clone_args args = {
                .flags          = ((lower_32_bits(flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
                .exit_signal    = (lower_32_bits(flags) & CSIGNAL),
                .stack          = (unsigned long)fn,
                .stack_size     = (unsigned long)arg,
        };

        return _do_fork(&args);
}
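Note the odd use of .stack and .stack_size: for a kernel thread they carry the function and its argument rather than a user stack. Roughly how they are consumed on x86 (abridged from arch/x86/kernel/process.c of a v5.x kernel; local-variable setup trimmed):
int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
                struct task_struct *p, unsigned long tls)
{
        ...
        /* Kernel thread ? */
        if (unlikely(p->flags & PF_KTHREAD)) {
                /* sp is really the thread function, arg its argument */
                memset(childregs, 0, sizeof(struct pt_regs));
                kthread_frame_init(frame, sp, arg);     /* stashed in callee-saved registers */
                return 0;
        }
        ...
}
ret_from_fork later spots the kernel-thread case and calls the stashed function with its argument.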
/*
struct pid init_struct_pid = {
.count = REFCOUNT_INIT(1),
.tasks = {
{ .first = NULL },
{ .first = NULL },
{ .first = NULL },
},
.level = 0,
.numbers = { {
.nr = 0,
.ns = &init_pid_ns,
}, }
};
*/
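init_struct_pid is the statically allocated pid of the boot task itself, which is never created by fork; abridged from init/init_task.c (v5.x):
struct task_struct init_task = {
        ...
        .flags          = PF_KTHREAD,
        .comm           = INIT_TASK_COMM,       /* "swapper" */
        .thread_pid     = &init_struct_pid,     /* the pid 0 structure above */
        ...
};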
So .numbers[0].nr = 0 is the pid of the current (boot) task, not of kernel_init; the kernel_init thread created here is the first task to get a dynamically allocated pid and therefore becomes pid 1. Back in rest_init, the new task is looked up and pinned:
rcu_read_lock(); /* enter an RCU read-side critical section around the pid lookup; with preemptible RCU this just marks the section, otherwise it essentially disables preemption */
tsk = find_task_by_pid_ns(pid, &init_pid_ns); /* look up the task_struct for this pid in the initial pid namespace, via the namespace's idr */
set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id())); /* pin init on the boot CPU; task migration does not work properly until sched_init_smp(), which later widens init's allowed CPUs to the non-isolated ones */
rcu_read_unlock();
/*
 * End of the RCU read-side critical section: the task_struct of the newly
 * created init thread has been looked up by pid and pinned to the boot CPU.
 */
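The lookup itself goes through the pid namespace's idr; roughly (kernel/pid.c, v5.x, lockdep check omitted):
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
        return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}

struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
        return idr_find(&ns->idr, nr);
}
Continuing in rest_init: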
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
// create kthreadd, which gets pid 2; all kernel-level threads descend from it (they are spawned as its children)
rcu_read_lock();
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
kthreadd is created with pid 2; the kernel-level threads all descend from it.
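To see why kernel threads descend from kthreadd, here is its main loop, abridged from kernel/kthread.c (v5.x; the initial comm/signal/cpumask setup is trimmed): it sleeps until kthread_create() queues a request, then forks the requested thread as its own child:
int kthreadd(void *unused)
{
        struct task_struct *tsk = current;

        set_task_comm(tsk, "kthreadd");
        ignore_signals(tsk);
        ...
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (list_empty(&kthread_create_list))
                        schedule();             /* nothing to create, sleep */
                __set_current_state(TASK_RUNNING);

                spin_lock(&kthread_create_lock);
                while (!list_empty(&kthread_create_list)) {
                        struct kthread_create_info *create;

                        create = list_entry(kthread_create_list.next,
                                            struct kthread_create_info, list);
                        list_del_init(&create->list);
                        spin_unlock(&kthread_create_lock);

                        create_kthread(create); /* kernel_thread(kthread, create, ...) */

                        spin_lock(&kthread_create_lock);
                }
                spin_unlock(&kthread_create_lock);
        }
}
Back in rest_init: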
system_state = SYSTEM_SCHEDULING;
complete(&kthreadd_done);
/*
* The boot idle thread must execute schedule()
* at least once to get things moving:
*/
schedule_preempt_disabled();
/* Call into cpu_idle with preempt disabled */
cpu_startup_entry(CPUHP_ONLINE);
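The complete(&kthreadd_done) above pairs with a wait at the very start of the init thread's work: kernel_init() calls kernel_init_freeable(), which must not create kernel threads before kthreadd exists. Roughly (init/main.c, v5.x):
static noinline void __init kernel_init_freeable(void)
{
        /* Wait until kthreadd is all set up */
        wait_for_completion(&kthreadd_done);
        ...
}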
kthreadd never exits while the kernel is running; the boot thread itself now drops into the idle loop and becomes the pid 0 idle task. Entering the cpu_startup_entry function:
void cpu_startup_entry(enum cpuhp_state state)
{
        arch_cpu_idle_prepare();
        cpuhp_online_idle(state);
        while (1)
                do_idle();
}
Entering the do_idle function:
static void do_idle(void)
{
        int cpu = smp_processor_id();

        /*
         * If the arch has a polling bit, we maintain an invariant:
         *
         * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
         * rq->idle). This means that, if rq->idle has the polling bit set,
         * then setting need_resched is guaranteed to cause the CPU to
         * reschedule.
         */

        // the idle task (pid 0) only gets to run when no other task on this CPU is runnable
        __current_set_polling();
        tick_nohz_idle_enter();

        while (!need_resched()) {
                rmb();

                local_irq_disable();

                if (cpu_is_offline(cpu)) {
                        tick_nohz_idle_stop_tick();
                        cpuhp_report_idle_dead();
                        arch_cpu_idle_dead();
                }

                arch_cpu_idle_enter();

                /*
                 * In poll mode we reenable interrupts and spin. Also if we
                 * detected in the wakeup from idle path that the tick
                 * broadcast device expired for us, we don't want to go deep
                 * idle as we know that the IPI is going to arrive right away.
                 */
                if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
                        tick_nohz_idle_restart_tick();
                        cpu_idle_poll();
                } else {
                        cpuidle_idle_call();
                }
                arch_cpu_idle_exit();
        }

        /*
         * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
         * be set, propagate it into PREEMPT_NEED_RESCHED.
         *
         * This is required because for polling idle loops we will not have had
         * an IPI to fold the state for us.
         */
        preempt_set_need_resched();
        tick_nohz_idle_exit();
        __current_clr_polling();

        /*
         * We promise to call sched_ttwu_pending() and reschedule if
         * need_resched() is set while polling is set. That means that clearing
         * polling needs to be visible before doing these things.
         */
        smp_mb__after_atomic();

        /*
         * RCU relies on this call to be done outside of an RCU read-side
         * critical section.
         */
        flush_smp_call_function_from_idle();
        schedule_idle();

        if (unlikely(klp_patch_pending(current)))
                klp_update_patch_state(current);
}
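The loop only exits once need_resched() becomes true; it simply tests the current task's TIF_NEED_RESCHED flag (include/linux/sched.h, v5.x):
static __always_inline bool need_resched(void)
{
        return unlikely(tif_need_resched());
}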
Entering the schedule_idle function:
void __sched schedule_idle(void)
{
        /*
         * As this skips calling sched_submit_work(), which the idle task does
         * regardless because that function is a nop when the task is in a
         * TASK_RUNNING state, make sure this isn't used someplace that the
         * current task can be in any other state. Note, idle is always in the
         * TASK_RUNNING state.
         */
        WARN_ON_ONCE(current->state);
        do {
                __schedule(false);
        } while (need_resched());
}
__schedule() is where the actual switch from the current runnable task to the next one happens. Besides explicit schedule()/cond_resched() calls inside the kernel, rescheduling is driven by the TIF_NEED_RESCHED flag, which is checked on return to user space from a system call or exception and on return from an interrupt handler.
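As an illustration (a hypothetical example, not from the kernel source; process_items, handle_item and struct item are made up), a long-running kernel loop typically offers to reschedule with cond_resched(), which only switches when need_resched() is true:
/* Hypothetical: walk a large array without hogging the CPU. */
static int process_items(struct item *items, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                handle_item(&items[i]); /* hypothetical per-item work */
                cond_resched();         /* voluntary preemption point */
        }
        return 0;
}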
After arch_call_rest_init has finished:
1. The kernel_init thread has been spawned as pid 1; it executes /init, which becomes the application-level systemd process (the parent of user-space processes; analyzed together with the systemd package);
2. The kthreadd thread has been spawned as pid 2, the parent of kernel-level threads.
The boot task itself remains pid 0 and is the parent of both pid 1 and pid 2: pid 1 is the ancestor of user space, pid 2 the ancestor of kernel threads.