startup_64
进入arch/x86/kernel/head_64.S,分析startup_64函数:
.text
__HEAD
.code64
SYM_CODE_START_NOALIGN(startup_64)
UNWIND_HINT_EMPTY
/*
.macro UNWIND_HINT_EMPTY
UNWIND_HINT sp_reg=ORC_REG_UNDEFINED end=1
.endm
struct unwind_hint {
u32 ip;
s16 sp_offset;
u8 sp_reg;
u8 type;
u8 end;
};
*/
进入startup_64函数,UNWIND_HINT_EMPTY将ORC unwinder(栈回溯)的提示设置为空(sp_reg=ORC_REG_UNDEFINED,end=1)
leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp
/* Sanitize CPU configuration */
call verify_cpu
此时已加载恒等映射页表(这些页表映射了所有内核页面,可能还映射了全部内存)。leaq将%rsp设置为__end_init_task - SIZEOF_PTREGS(调试时对应__end_rodata_aligned+0x3f58),为verify_cpu建立临时栈,然后调用verify_cpu校验/清理CPU配置信息。
/*
* Perform pagetable fixups. Additionally, if SME is active, encrypt
* the kernel and retrieve the modifier (SME encryption mask if SME
* is active) to be added to the initial pgdir entry that will be
* programmed into CR3.
*/
leaq _text(%rip), %rdi
pushq %rsi
call __startup_64
popq %rsi
计算出_text的实际加载地址作为第一个参数(%rdi),用于之后修正页表中的物理地址;%rsi(boot_params指针)作为第二个参数,调用前后通过pushq/popq保存和恢复。调用__startup_64函数:
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
unsigned long vaddr, vaddr_end;
unsigned long load_delta, *p;
unsigned long pgtable_flags;
pgdval_t *pgd;
p4dval_t *p4d;
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
pteval_t *mask_ptr;
bool la57;
int i;
unsigned int *next_pgt_ptr;
la57 = check_la57_support(physaddr);
// 每个全局指针都必须使用fixup_pointer()进行调整,防止因未对全局指针重定位导致的启动崩溃
/* Is the address too large? */
if (physaddr >> MAX_PHYSMEM_BITS)
for (;;);
/*
* Compute the delta between the address I am compiled to run at
* and the address I am actually running at.
*/
load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
/* Is the address not 2M aligned? */
if (load_delta & ~PMD_PAGE_MASK)
for (;;);
/* Activate Secure Memory Encryption (SME) if supported and enabled */
sme_enable(bp);
// 如果支持并启用,请激活SME(安全内存加密)
/* Include the SME encryption mask in the fixup value */
load_delta += sme_get_me_mask();
// 在修正值中包含SME加密掩码
/* Fixup the physical addresses in the page table */
pgd = fixup_pointer(&early_top_pgt, physaddr);
p = pgd + pgd_index(__START_KERNEL_map);
使用fixup_pointer()调整全局指针;计算load_delta(内核实际运行地址与编译链接地址之间的差值);如果支持并启用了SME,则在修正值中包含SME加密掩码
if (la57) //如果已启动5级分页,
*p = (unsigned long)level4_kernel_pgt; // extern p4d_t level4_kernel_pgt[512];
else
*p = (unsigned long)level3_kernel_pgt; // extern pud_t level3_kernel_pgt[512];
*p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
if (la57) {
p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
p4d[511] += load_delta;
}
pud = fixup_pointer(&level3_kernel_pgt, physaddr);
pud[510] += load_delta;
pud[511] += load_delta;
pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
pmd[i] += load_delta;
计算内存页表地址
/*
* Set up the identity mapping for the switchover. These
* entries should *NOT* have the global bit set! This also
* creates a bunch of nonsense entries but that is fine --
* it avoids problems around wraparound.
*/
next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
if (la57) {
p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++],
physaddr);
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
i = physaddr >> P4D_SHIFT;
p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
} else {
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
}
i = physaddr >> PUD_SHIFT;
pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
/* Filter out unsupported __PAGE_KERNEL_* bits: */
mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr);
pmd_entry &= *mask_ptr;
pmd_entry += sme_get_me_mask();
pmd_entry += physaddr;
for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
int idx = i + (physaddr >> PMD_SHIFT);
pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
}
建立用于切换的恒等映射(不设置全局位),并用__supported_pte_mask过滤掉不支持的__PAGE_KERNEL_*属性位
pmd = fixup_pointer(level2_kernel_pgt, physaddr);
/* invalidate pages before the kernel image */
for (i = 0; i < pmd_index((unsigned long)_text); i++)
pmd[i] &= ~_PAGE_PRESENT;
/* fixup pages that are part of the kernel image */
for (; i <= pmd_index((unsigned long)_end); i++)
if (pmd[i] & _PAGE_PRESENT)
pmd[i] += load_delta;
/* invalidate pages after the kernel image */
for (; i < PTRS_PER_PMD; i++)
pmd[i] &= ~_PAGE_PRESENT;
修复内核text段数据虚拟地址
/*
* Fixup phys_base - remove the memory encryption mask to obtain
* the true physical address.
*/
*fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
/* Encrypt the kernel and related (if SME is active) */
sme_encrypt_kernel(bp);
移除内存加密掩码以获取真实的地址
if (mem_encrypt_active()) {
vaddr = (unsigned long)__start_bss_decrypted;
vaddr_end = (unsigned long)__end_bss_decrypted;
for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
i = pmd_index(vaddr);
pmd[i] -= sme_get_me_mask();
}
}
/*
* Return the SME encryption mask (if SME is active) to be used as a
* modifier for the initial pgdir entry programmed into CR3.
*/
return sme_get_me_mask();
清除bss中需解密部分(__start_bss_decrypted到__end_bss_decrypted)对应PMD项里的内存加密掩码。bss段将在稍后的初始化中被清零,因此更改内存加密属性后无需再将其归零。最后返回SME加密掩码(如果SME处于活动状态),用作写入CR3的初始pgdir条目的修饰符
popq %rsi
/* Form the CR3 value being sure to include the CR3 modifier */
addq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
跳转到secondary_startup_64函数(secondary_startup_64+0x12)
/* Enable PAE mode, PGE and LA57 */
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
testl $1, __pgtable_l5_enabled(%rip)
jz 1f
orl $X86_CR4_LA57, %ecx
1:
#endif
启用PAE模式、PGE
movq %rcx, %cr4
/* Setup early boot stage 4-/5-level pagetables. */
addq phys_base(%rip), %rax
movq %rax, %cr3
/* Ensure I am executing from virtual addresses */
movq $1f, %rax
ANNOTATE_RETPOLINE_SAFE
jmp *%rax
设置早期启动阶段4级页表,继续向下跳转
/* Check if nx is implemented */
movl $0x80000001, %eax
cpuid
movl %edx,%edi
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_SCE, %eax /* Enable System Call */
btl $20,%edi /* No Execute supported? */
jnc 1f
设置EFER(扩展功能启用寄存器),启动系统调用(system call),跳转到wrmsr
1: wrmsr /* Make changes effective */
/* Setup cr0 */
movl $CR0_STATE, %eax
/* Make changes effective */
movq %rax, %cr0
/* Setup a boot time stack */
movq initial_stack(%rip), %rsp
设置启动时堆栈
/* zero EFLAGS after setting rsp */
pushq $0
popfq
/*
* We must switch to a new descriptor in kernel space for the GDT
* because soon the kernel won't have access anymore to the userspace
* addresses where we're currently running on. We have to do that here
* because in 32bit we couldn't load a 64bit linear address.
*/
lgdt early_gdt_descr(%rip)
加载位于内核空间的新GDT描述符(early_gdt_descr)
/* set up data segments */
xorl %eax,%eax
movl %eax,%ds
movl %eax,%ss
movl %eax,%es
/*
* We don't really need to load %fs or %gs, but load them anyway
* to kill any stale realmode selectors. This allows execution
* under VT hardware.
*/
movl %eax,%fs
movl %eax,%gs
eax寄存器清空,设置部分寄存器
/* Set up %gs.
*
* The base of %gs always points to fixed_percpu_data. If the
* stack protector canary is enabled, it is located at %gs:40.
* Note that, on SMP, the boot cpu uses init data section until
* the per cpu areas are set up.
*/
movl $MSR_GS_BASE,%ecx
movl initial_gs(%rip),%eax
movl initial_gs+4(%rip),%edx
wrmsr
/* rsi is pointer to real mode structure with interesting info.
pass it to C */
movq %rsi, %rdi
在SMP上,引导cpu使用init data部分,直到每个cpu区域都已设置
SYM_DATA(initial_code, .quad x86_64_start_kernel)
执行x86_64_start_kernel函数:
asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
{
/*
* Build-time sanity checks on the kernel image and module
* area mappings. (these are purely build-time and produce no code)
*/
BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map);
BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE);
BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
cr4_init_shadow();
内核映像和模块的构建时健全性检查,初始化(本cpu)cr4所运行的模式(x86_64)
/* Kill off the identity-map trampoline */
reset_early_page_tables();
clear_bss();
clear_page(init_top_pgt);
清除除内核符号映射之外的所有早期页面表,清空bss段,将init_top_pgt页清零
sme_early_init();
//sme初始化,使用内存加密掩码更新保护映射
kasan_early_init();
//遮蔽不支持的内核页,在kasan中登记顶级页表地址
idt_setup_early_handler();
//初始化idt表
copy_bootdata(__va(real_mode_data));
//拷贝有效数据(旧的启动数据内存释放(空间),删除sme相关的映射)
sme、kasan、idt初始化,拷贝有效数据
/*
* Load microcode early on BSP.
*/
load_ucode_bsp();
// 在BSP(引导处理器)上提前加载CPU微码(microcode)
/* set init_top_pgt kernel high mapping*/
init_top_pgt[511] = early_top_pgt[511];
x86_64_start_reservations(real_mode_data);
加载CPU微码(microcode),设置内核高地址映射,进入x86_64_start_reservations函数:
/*
 * Second stage of the early x86-64 C entry path: copies the boot data if that
 * has not happened yet, runs early platform quirk and sub-architecture setup,
 * then calls start_kernel().
 *
 * real_mode_data: address of the boot data handed over by the earlier boot
 * stage; it is converted with __va() before being passed to copy_bootdata().
 */
void __init x86_64_start_reservations(char *real_mode_data)
{
/* version is always not zero if it is copied */
if (!boot_params.hdr.version)
copy_bootdata(__va(real_mode_data));
x86_early_init_platform_quirks();
// NOTE(review): presumably configures legacy-platform quirks such as the i8042 keyboard controller, if present -- confirm against the callee
switch (boot_params.hdr.hardware_subarch) {
case X86_SUBARCH_INTEL_MID:
x86_intel_mid_early_setup();
// early setup specific to the Intel MID sub-architecture
break;
default:
break;
}
start_kernel();
}
执行早期平台quirk设置(如i8042键盘),针对Intel MID子架构做早期初始化,进入 start_kernel函数(init/main.c):
asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
{
char *command_line;
char *after_dashes;
set_task_stack_end_magic(&init_task);
//设置工作栈尾标志,用于溢出检测
smp_setup_processor_id();
debug_objects_early_init();
//在早期引导期间调用以初始化哈希buckets和链接,静态对象将对象汇集到轮询列表中
cgroup_init_early();
//初始化cgroup,并初始化任何请求早期初始化的子系统
local_irq_disable();
//禁用中断
early_boot_irqs_disabled = true;
设置工作栈尾标志,初始化debug objects相关,初始化cgroup,禁用中断
/*
* Interrupts are still disabled. Do necessary setups, then
* enable them.
*/
boot_cpu_init();
//初始化第一个cpu
page_address_init();
pr_notice("%s", linux_banner);
early_security_init();
setup_arch(&command_line);
//将内核占用的内存(_text到__end_of_kernel_reserve符号之间)保留下来;确保始终保留第0页,因为在受L1TF影响的系统上,其内容可能会泄露给用户进程;此时,仍然需要的来自引导加载程序、BIOS或内核代码段的内容都应已被提前保留
/* boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
#ifdef CONFIG_X86_32
# ifdef CONFIG_X86_PAE
# define SECTION_SIZE_BITS 29
# define MAX_PHYSMEM_BITS 36
# else
# define SECTION_SIZE_BITS 26
# define MAX_PHYSMEM_BITS 32
# endif
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
# define MAX_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)
#endif
*/
如果存在OLPC OFW,可能会因reserve_top()而重定位fixmap,因此需要在触及ioremap区域之前执行此操作
初始化第一个cpu,初始化架构配置
setup_boot_config(command_line);
//即使我们没有bootconfig选项,也要删除bootconfig数据,以“kernel.”开头的键是通过cmdline传递的
setup_command_line(command_line);
//存储未触及的命令行以供将来参考
setup_nr_cpu_ids();
//arch可以提前设置nr_cpu_ID,通常是多余的
setup_per_cpu_areas();
//分配percpu区域
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
boot_cpu_hotplug_init();
//设置cpu热插拔,必须在设置每个cpu区域后调用
删除/保存部分配置,设置percpu区域,设置cpu热插拔
build_all_zonelists(NULL);
page_alloc_init();
//设置cpu热插拔回调函数
pr_notice("Kernel command line: %s\n", saved_command_line);
/* parameters may set static keys */
jump_label_init();
//初始化跳转标签
parse_early_param();
after_dashes = parse_args("Booting kernel",
static_command_line, __start___param,
__stop___param - __start___param,
-1, -1, NULL, &unknown_bootoption);
//解析内核命令行参数(未识别的参数交给unknown_bootoption处理)
if (!IS_ERR_OR_NULL(after_dashes))
parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
NULL, set_init_arg);
if (extra_init_args)
parse_args("Setting extra init args", extra_init_args,
NULL, 0, -1, -1, NULL, set_init_arg);
设置cpu热插拔回调函数,初始化跳转标签,解析参数
/*
* These use large bootmem allocations and must precede
* kmem_cache_init()
*/
setup_log_buf(0);
//分配内核打印缓存区
vfs_caches_init_early();
//虚拟文件系统初始化,dcache、inode初始化
sort_main_extable();
//对内核的内置异常表进行排序
trap_init();
//在设置IST条目之前初始化cpu条目区域,应该是任何外部CPU状态的屏障
mm_init();
//设置内核内存分配器
分配内核打印缓存区,虚拟文件系统初始化,对内核的内置异常表进行排序,设置内核内存分配器
ftrace_init();
//设置ftrace过滤器,global_ops
/* trace_printk can be enabled here */
early_trace_init();
// trace分配
/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();
//调度器初始化
/*
* Disable preemption - early bootup scheduling is extremely
* fragile until we cpu_idle() for the first time.
*/
preempt_disable();
//禁用抢占模式
if (WARN(!irqs_disabled(),
"Interrupts were enabled *very* early, fixing it\n"))
local_irq_disable();
radix_tree_init();
//创建radix_tree_node
设置ftrace过滤器,trace分配,调度器初始化,禁用抢占模式,创建radix_tree_node
/*
* Set up housekeeping before setting up workqueues to allow the unbound
* workqueue to take non-housekeeping into account.
*/
housekeeping_init();
/*
* Allow workqueue creation and work item queueing/cancelling
* early. Work item execution depends on kthreads and starts after
* workqueue_init().
*/
workqueue_init_early();
//这是两阶段工作队列子系统初始化的前半部分,在最基本的基础上调用——内存分配、CPU任务和idr上升。它设置了所有的数据结构和系统工作队列并允许早期启动代码创建工作队列和排队/取消工作任务
初始化工作队列
rcu_init();
/* Trace events are available after this */
trace_init();
if (initcall_debug)
initcall_debug_enable();
context_tracking_init();
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
rcu初始化,中断初始化
tick_init();
rcu_init_nohz();
init_timers();
hrtimers_init();
//初始化软中断(对应下面的softirq_init)
softirq_init();
timekeeping_init();
//初始化时钟源和公共计时值
初始化软中断,初始化时钟源
/*
* For best initial stack canary entropy, prepare it after:
* - setup_arch() for any UEFI RNG entropy and boot cmdline access
* - timekeeping_init() for ktime entropy used in rand_initialize()
* - rand_initialize() to get any arch-specific entropy like RDRAND
* - add_latent_entropy() to get any latent entropy
* - adding command line entropy
*/
rand_initialize();
//初始化随机池子
add_latent_entropy();
add_device_randomness(command_line, strlen(command_line));
boot_init_stack_canary();
time_init();
//初始化TSC并将周期计时器初始化
perf_event_init();
profile_init();
call_function_init();
WARN(!irqs_disabled(), "Interrupts were enabled early\n");
early_boot_irqs_disabled = false;
初始化随机池子,初始化TSC并将周期计时器初始化
local_irq_enable();
//恢复中断
kmem_cache_init_late();
//kmem缓冲区后续初始化
/*
* HACK ALERT! This is early. We're enabling the console before
* we've done PCI setups etc, and console_init() must be aware of
* this. But we do want output early, in case something goes wrong.
*/
console_init();
//设置控制台参数(之前已经初始化过)
if (panic_later)
panic("Too many boot %s vars at `%s'", panic_later,
panic_param);
lockdep_init();
恢复中断,kmem缓冲区后续初始化,设置控制台参数
locking_selftest();
/*
* This needs to be called before any devices perform DMA
* operations that might use the SWIOTLB bounce buffers. It will
* mark the bounce buffers as decrypted so that their usage will
* not cause "plain-text" data to be decrypted when accessed.
*/
mem_encrypt_init();
setup_per_cpu_pageset();
//分配每个cpu页面集并初始化它们
numa_policy_init();
acpi_early_init();
//初始化ACPICA并填充acpi命名空间
if (late_time_init)
late_time_init();
sched_clock_init();
//调度时钟初始化
calibrate_delay();
pid_idr_init();
//pid表初始化
anon_vma_init();
分配每个cpu页面集并初始化它们,初始化ACPICA并填充acpi命名空间,调度时钟初始化,pid表初始化
thread_stack_cache_init();
cred_init();
fork_init();
//创建一个可以分配任务结构的slab,指定架构任务缓存,设置最大线程
proc_caches_init();
//sighand_cache、signal_cache、files_cache、fs_cache
uts_ns_init();
buffer_init();
//buffer_head
key_init();
security_init();
dbg_late_init();
vfs_caches_init();
//names_cache
pagecache_init();
//页缓存队列
signals_init();
//信号池
seq_file_init();
//seq_file_cache
proc_root_init();
//fs、driver、fs/nfsd、bus等创建,注册文件系统
nsfs_init();
cpuset_init();
cgroup_init();
taskstats_init_early();
delayacct_init();
poking_init();
...
prevent_tail_call_optimization();
//#define mb() asm volatile("mfence":::"memory")
到这里start_kernel(由x86_64_start_kernel经x86_64_start_reservations调用)的流程分析完成。