Kernel runtime state
The kernel initialization flow has now been analyzed. How does the kernel stay in its runtime state (i.e. never return)? Next we look at the function skipped earlier, arch_call_rest_init:
/* Do the rest non-__init'ed, we're now alive */
arch_call_rest_init();
Entering the arch_call_rest_init function, whose body is simply:
rest_init();
Inside rest_init():
struct task_struct *tsk;
int pid;
rcu_scheduler_starting();
rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
/*
#define RCU_SCHEDULER_INACTIVE 0
#define RCU_SCHEDULER_INIT 1
#define RCU_SCHEDULER_RUNNING 2
*/
rcu_scheduler_active is set to the running state (RCU_SCHEDULER_RUNNING), telling RCU that the scheduler is now active.
/*
* We need to spawn init first so that it obtains pid 1, however
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
pid = kernel_thread(kernel_init, NULL, CLONE_FS);
/* kernel_init ---- completes the deferred initialization (mm, rt/dl scheduler pieces and other setup done in kernel_init_freeable), opens /dev/console, waits for all asynchronous init code to finish before freeing init memory, then runs the "/init" process, falling back to /sbin/init, /etc/init, /bin/init and /bin/sh */
/* system_state = SYSTEM_RUNNING; */
Here kernel_init ultimately executes /init, /sbin/init, /etc/init, /bin/init or /bin/sh (on a systemd-based system, systemd --switched-root --system… ends up as the pid 1 process, and user-space processes descend from this pid 1 parent; see the systemd package analysis). The fallback order is sketched below.
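Abridged from the tail of kernel_init() in init/main.c (a v5.x kernel; error reporting and the CONFIG_DEFAULT_INIT case are trimmed), the selection looks roughly like this:
        if (ramdisk_execute_command) {
                ret = run_init_process(ramdisk_execute_command);        /* usually "/init" from the initramfs */
                if (!ret)
                        return 0;
        }
        if (execute_command) {                                          /* "init=" on the kernel command line */
                ret = run_init_process(execute_command);
                if (!ret)
                        return 0;
                panic("Requested init %s failed (error %d).", execute_command, ret);
        }
        if (!try_to_run_init_process("/sbin/init") ||
            !try_to_run_init_process("/etc/init") ||
            !try_to_run_init_process("/bin/init") ||
            !try_to_run_init_process("/bin/sh"))
                return 0;

        panic("No working init found.  Try passing init= option to kernel. "
              "See Linux Documentation/admin-guide/init.rst for guidance.");
Now, entering the kernel_thread function: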
/*
* Create a kernel thread.
*/
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
        struct kernel_clone_args args = {
                .flags          = ((lower_32_bits(flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
                .exit_signal    = (lower_32_bits(flags) & CSIGNAL),
                .stack          = (unsigned long)fn,
                .stack_size     = (unsigned long)arg,
        };

        return _do_fork(&args);
}
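Note the odd use of .stack and .stack_size: for a kernel thread they carry the function and its argument rather than a user stack. Roughly how they are consumed on x86 (abridged from arch/x86/kernel/process.c of a v5.x kernel; local-variable setup trimmed):
int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
                struct task_struct *p, unsigned long tls)
{
        ...
        /* Kernel thread ? */
        if (unlikely(p->flags & PF_KTHREAD)) {
                /* sp is really the thread function, arg its argument */
                memset(childregs, 0, sizeof(struct pt_regs));
                kthread_frame_init(frame, sp, arg);     /* stashed in callee-saved registers */
                return 0;
        }
        ...
}
ret_from_fork later spots the kernel-thread case and calls the stashed function with its argument.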
/*
struct pid init_struct_pid = {
.count = REFCOUNT_INIT(1),
.tasks = {
{ .first = NULL },
{ .first = NULL },
{ .first = NULL },
},
.level = 0,
.numbers = { {
.nr = 0,
.ns = &init_pid_ns,
}, }
};
*/
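init_struct_pid is the statically allocated pid of the boot task itself, which is never created by fork; abridged from init/init_task.c (v5.x):
struct task_struct init_task = {
        ...
        .flags          = PF_KTHREAD,
        .comm           = INIT_TASK_COMM,       /* "swapper" */
        .thread_pid     = &init_struct_pid,     /* the pid 0 structure above */
        ...
};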
So .numbers[0].nr = 0 is the pid of the current (boot) task, not of kernel_init; the kernel_init thread created here is the first task to get a dynamically allocated pid and therefore becomes pid 1. Back in rest_init, the new task is looked up and pinned:
rcu_read_lock(); /* enter an RCU read-side critical section around the pid lookup; with preemptible RCU this just marks the section, otherwise it essentially disables preemption */
tsk = find_task_by_pid_ns(pid, &init_pid_ns); /* look up the task_struct for this pid in the initial pid namespace, via the namespace's idr */
set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id())); /* pin init on the boot CPU; task migration does not work properly until sched_init_smp(), which later widens init's allowed CPUs to the non-isolated ones */
rcu_read_unlock();
/*
 * End of the RCU read-side critical section: the task_struct of the newly
 * created init thread has been looked up by pid and pinned to the boot CPU.
 */
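The lookup itself goes through the pid namespace's idr; roughly (kernel/pid.c, v5.x, lockdep check omitted):
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
        return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}

struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
        return idr_find(&ns->idr, nr);
}
Continuing in rest_init: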
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
// create kthreadd, which gets pid 2; all kernel-level threads descend from it (they are spawned as its children)
rcu_read_lock();
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
kthreadd is created with pid 2; the kernel-level threads all descend from it.
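To see why kernel threads descend from kthreadd, here is its main loop, abridged from kernel/kthread.c (v5.x; the initial comm/signal/cpumask setup is trimmed): it sleeps until kthread_create() queues a request, then forks the requested thread as its own child:
int kthreadd(void *unused)
{
        struct task_struct *tsk = current;

        set_task_comm(tsk, "kthreadd");
        ignore_signals(tsk);
        ...
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (list_empty(&kthread_create_list))
                        schedule();             /* nothing to create, sleep */
                __set_current_state(TASK_RUNNING);

                spin_lock(&kthread_create_lock);
                while (!list_empty(&kthread_create_list)) {
                        struct kthread_create_info *create;

                        create = list_entry(kthread_create_list.next,
                                            struct kthread_create_info, list);
                        list_del_init(&create->list);
                        spin_unlock(&kthread_create_lock);

                        create_kthread(create); /* kernel_thread(kthread, create, ...) */

                        spin_lock(&kthread_create_lock);
                }
                spin_unlock(&kthread_create_lock);
        }
}
Back in rest_init: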
system_state = SYSTEM_SCHEDULING;
complete(&kthreadd_done);
/*
* The boot idle thread must execute schedule()
* at least once to get things moving:
*/
schedule_preempt_disabled();
/* Call into cpu_idle with preempt disabled */
cpu_startup_entry(CPUHP_ONLINE);
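The complete(&kthreadd_done) above pairs with a wait at the very start of the init thread's work: kernel_init() calls kernel_init_freeable(), which must not create kernel threads before kthreadd exists. Roughly (init/main.c, v5.x):
static noinline void __init kernel_init_freeable(void)
{
        /* Wait until kthreadd is all set up */
        wait_for_completion(&kthreadd_done);
        ...
}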
kthreadd never exits while the kernel is running; the boot thread itself now drops into the idle loop and becomes the pid 0 idle task. Entering the cpu_startup_entry function:
void cpu_startup_entry(enum cpuhp_state state)
{
        arch_cpu_idle_prepare();
        cpuhp_online_idle(state);
        while (1)
                do_idle();
}
Entering the do_idle function:
static void do_idle(void)
{
        int cpu = smp_processor_id();

        /*
         * If the arch has a polling bit, we maintain an invariant:
         *
         * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
         * rq->idle). This means that, if rq->idle has the polling bit set,
         * then setting need_resched is guaranteed to cause the CPU to
         * reschedule.
         */

        // the idle task (pid 0) only gets to run when no other task on this CPU is runnable
        __current_set_polling();
        tick_nohz_idle_enter();

        while (!need_resched()) {
                rmb();

                local_irq_disable();

                if (cpu_is_offline(cpu)) {
                        tick_nohz_idle_stop_tick();
                        cpuhp_report_idle_dead();
                        arch_cpu_idle_dead();
                }

                arch_cpu_idle_enter();

                /*
                 * In poll mode we reenable interrupts and spin. Also if we
                 * detected in the wakeup from idle path that the tick
                 * broadcast device expired for us, we don't want to go deep
                 * idle as we know that the IPI is going to arrive right away.
                 */
                if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
                        tick_nohz_idle_restart_tick();
                        cpu_idle_poll();
                } else {
                        cpuidle_idle_call();
                }
                arch_cpu_idle_exit();
        }

        /*
         * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
         * be set, propagate it into PREEMPT_NEED_RESCHED.
         *
         * This is required because for polling idle loops we will not have had
         * an IPI to fold the state for us.
         */
        preempt_set_need_resched();
        tick_nohz_idle_exit();
        __current_clr_polling();

        /*
         * We promise to call sched_ttwu_pending() and reschedule if
         * need_resched() is set while polling is set. That means that clearing
         * polling needs to be visible before doing these things.
         */
        smp_mb__after_atomic();

        /*
         * RCU relies on this call to be done outside of an RCU read-side
         * critical section.
         */
        flush_smp_call_function_from_idle();
        schedule_idle();

        if (unlikely(klp_patch_pending(current)))
                klp_update_patch_state(current);
}
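The loop only exits once need_resched() becomes true; it simply tests the current task's TIF_NEED_RESCHED flag (include/linux/sched.h, v5.x):
static __always_inline bool need_resched(void)
{
        return unlikely(tif_need_resched());
}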
Entering the schedule_idle function:
void __sched schedule_idle(void)
{
        /*
         * As this skips calling sched_submit_work(), which the idle task does
         * regardless because that function is a nop when the task is in a
         * TASK_RUNNING state, make sure this isn't used someplace that the
         * current task can be in any other state. Note, idle is always in the
         * TASK_RUNNING state.
         */
        WARN_ON_ONCE(current->state);
        do {
                __schedule(false);
        } while (need_resched());
}
__schedule() is where the actual switch from the current runnable task to the next one happens. Besides explicit schedule()/cond_resched() calls inside the kernel, rescheduling is driven by the TIF_NEED_RESCHED flag, which is checked on return to user space from a system call or exception and on return from an interrupt handler.
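As an illustration (a hypothetical example, not from the kernel source; process_items, handle_item and struct item are made up), a long-running kernel loop typically offers to reschedule with cond_resched(), which only switches when need_resched() is true:
/* Hypothetical: walk a large array without hogging the CPU. */
static int process_items(struct item *items, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                handle_item(&items[i]); /* hypothetical per-item work */
                cond_resched();         /* voluntary preemption point */
        }
        return 0;
}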
After arch_call_rest_init has finished:
1. The kernel_init thread has been spawned as pid 1; it executes /init, which becomes the application-level systemd process (the parent of user-space processes; analyzed together with the systemd package);
2. The kthreadd thread has been spawned as pid 2, the parent of kernel-level threads.
The boot task itself remains pid 0 and is the parent of both pid 1 and pid 2: pid 1 is the ancestor of user space, pid 2 the ancestor of kernel threads.