Linux内核启动过程
多处理器系统
Linux内核启动在单处理器和多处理器系统上有显著区别:单处理器系统只有一个CPU,整个启动过程都在这个CPU上完成;多处理器系统有多个CPU,Intel将这些CPU划分为两类:BSP(BootStrap Processor,主处理器,只有一个)和AP(Application Processor,从处理器,可以有多个)。内核先在BSP上启动,再由BSP通过IPI(Inter-Processor Interrupt,处理器间中断)通知AP启动。由于BSP已经完成了大部分启动工作,AP不需要从头开始,只需要执行少部分启动工作。
在计算机引导过程四(vmlinuz保护模式vmlinux.bin)中提到解压后内核起始地址是arch/x86/kernel/head_64.S中的startup_64,在startup_64中有段代码体现了两者的差异
/* Form the CR3 value being sure to include the CR3 modifier */
addq $(early_top_pgt - __START_KERNEL_map), %rax // rax = early_top_pgt物理编译地址
jmp 1f // BSP:跳转到1f,startup_64 -> initial_code(对于BSP,是x86_64_start_kernel的地址)
ENTRY(secondary_startup_64) // AP:secondary_startup_64 -> initial_code(对于AP,是start_secondary的地址)
...
/*
* Retrieve the modifier (SME encryption mask if SME is active) to be
* added to the initial pgdir entry that will be programmed into CR3.
*/
pushq %rsi
call __startup_secondary_64 // 调用__startup_secondary_64获取加密掩码,返回值rax = 加密掩码(一般为0)
popq %rsi
/* Form the CR3 value being sure to include the CR3 modifier */
addq $(init_top_pgt - __START_KERNEL_map), %rax // rax = init_top_pgt物理编译地址
1:
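可以看到,BSP执行完secondary_startup_64之前的部分后直接跳转到标号1(jmp 1f),跳过了从secondary_startup_64到标号1之间的代码;而AP从secondary_startup_64开始执行。从标号1开始,两者执行的代码完全一样,区别只在于rax中用来构造CR3的页表地址(BSP为early_top_pgt,AP为init_top_pgt)以及initial_code指向的函数(BSP为x86_64_start_kernel,AP为start_secondary)。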
BSP调用栈
BSP调用栈比较清楚,startup_64 -> x86_64_start_kernel
// arch/x86/kernel/head_64.S
startup_64 ->
initial_code -> // x86_64_start_kernel
reset_early_page_tables(); // 清空early_top_pgt,保留kernel映射区
clear_page(init_top_pgt); // 清空init_top_pgt
init_top_pgt[511] = early_top_pgt[511]; // 保留kernel映射区
x86_64_start_reservations ->
start_kernel
AP调用栈
AP调用栈有点复杂:AP的入口地址(trampoline_start、tr_start、initial_code)是由BSP在kernel_init中设置的,这里我们先列出调用栈,等到kernel_init一节再详细解释
// arch/x86/realmode/rm/trampoline_64.S
trampoline_start ->
startup_32 ->
movl $pa_trampoline_pgd, %eax
movl %eax, %cr3 // cr3 = pa_trampoline_pgd,即trampoline_pgd的物理地址(其第511项在setup_real_mode中拷贝自init_top_pgt[511],因此与init_top_pgt共享kernel映射区)
startup_64 ->
jmpq *tr_start(%rip) // 跳转到tr_start = secondary_startup_64
// arch/x86/kernel/head_64.S
secondary_startup_64 ->
initial_code -> // start_secondary
cpu_startup_entry ->
while (1)
do_idle();
start_kernel
内核进程,pid=0,一般称为0号/idle/swapper进程,是systemd和kthreadd的父进程
start_kernel ->
setup_arch ->
early_alloc_pgt_buf
e820__memblock_setup -> // 将e820_table转换为memblock
memblock_add // 将一块物理内存区域加入到memblock.memory
init_mem_mapping ->
init_memory_mapping(0, ISA_END_ADDRESS); // 创建1MB以下的direct mapping映射区
memory_map_top_down(ISA_END_ADDRESS, end); -> // 创建1MB以上的direct mapping映射区
init_range_memory_mapping ->
init_memory_mapping
load_cr3(swapper_pg_dir); // #define swapper_pg_dir init_top_pgt
__flush_tlb_all(); // 刷新tlb
x86_init.paging.pagetable_init(); -> // native_pagetable_init
paging_init ->
sparse_init // SPARSEMEM模型初始化
arch_call_rest_init ->
rest_init ->
// 创建内核线程kernel_init
pid = kernel_thread(kernel_init, NULL, CLONE_FS);
// 创建内核线程kthreadd
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
cpu_startup_entry ->
while (1)
do_idle();
early_alloc_pgt_buf
/*
 * Carve INIT_PGT_BUF_SIZE bytes out of the brk area and record the bounds
 * of that buffer, as page frame numbers, in pgt_buf_start, pgt_buf_end and
 * pgt_buf_top.
 */
void __init early_alloc_pgt_buf(void)
{
/* Size of the buffer in bytes. */
unsigned long tables = INIT_PGT_BUF_SIZE;
phys_addr_t base;
/* extend_brk() returns a PAGE_SIZE-aligned, zero-filled region; __pa() converts its address to a physical one. */
base = __pa(extend_brk(tables, PAGE_SIZE));
/* First page frame of the buffer. */
pgt_buf_start = base >> PAGE_SHIFT;
/* Starts equal to pgt_buf_start; presumably the allocation cursor -- confirm against the consumer of pgt_buf_end. */
pgt_buf_end = pgt_buf_start;
/* Page frame just past the end of the buffer. */
pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
}
/*
 * Take `size` bytes, aligned to `align`, from the brk area and return a
 * pointer to the zero-filled region.
 *
 * BUG()s if _brk_start is 0, if `align` is not a power of two
 * (align & (align - 1) is non-zero), or if the request would extend past
 * __brk_limit.
 */
void * __init extend_brk(size_t size, size_t align)
{
size_t mask = align - 1;
void *ret;
BUG_ON(_brk_start == 0);
BUG_ON(align & mask);
/* Round the current end of brk up to the requested alignment. */
_brk_end = (_brk_end + mask) & ~mask;
BUG_ON((char *)(_brk_end + size) > __brk_limit);
/* Article annotation: _brk_end is defined as `unsigned long _brk_end = (unsigned long)__brk_base;` */
ret = (void *)_brk_end;
/* Consume `size` bytes of brk. */
_brk_end += size;
memset(ret, 0, size);
return ret;
}
调用RESERVE_BRK在.brk_reservation中保留大小为INIT_PGT_BUF_SIZE的空间
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
RESERVE_BRK定义了一个不会被调用的函数(放在.discard.text节中,从节名看链接时会被丢弃),真正起作用的是函数体内的内联汇编:在汇编阶段切换到.brk_reservation节,并通过.skip在其中保留大小为sz的空间
/*
 * RESERVE_BRK(name, sz): reserve sz bytes in the .brk_reservation section.
 *
 * The macro emits a dummy function placed in .discard.text. The reservation
 * itself is done by the inline asm: it switches to .brk_reservation
 * (flags "aw", @nobits), defines the label .brk.<name>, skips sz bytes,
 * records the size of that symbol and switches back to the previous section.
 * sz is passed with the "i" constraint, so it must be a compile-time constant.
 */
#define RESERVE_BRK(name,sz) \
static void __section(.discard.text) __used notrace \
__brk_reservation_fn_##name##__(void) { \
asm volatile ( \
".pushsection .brk_reservation,\"aw\",@nobits;" \
".brk." #name ":" \
" 1:.skip %c0;" \
" .size .brk." #name ", . - 1b;" \
" .popsection" \
: : "i" (sz)); \
}
.brk_reservation位于.brk
// arch/x86/kernel/vmlinux.lds.S
// Layout of the .brk output section: __brk_base, then 64KB of slop, then
// every .brk_reservation input section, then __brk_limit.
. = ALIGN(PAGE_SIZE);
.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
__brk_base = .;
. += 64 * 1024; /* 64k alignment slop space */ // advance the location counter 64KB past __brk_base
*(.brk_reservation) /* areas brk users have reserved */ // the reservations are placed AFTER the 64KB slop, not inside it
__brk_limit = .;
}
kernel_init
内核线程,pid=1;完成剩余的内核初始化后,通过do_execve执行/sbin/init(即systemd),自身由内核线程转变为用户进程systemd,而不是另外启动一个进程后退出
systemd是第一个用户进程,pid=1,是所有用户进程的祖先进程
ps aux|grep systemd
kernel_init ->
kernel_init_freeable ->
do_pre_smp_initcalls ->
init_real_mode ->
setup_real_mode ->
// real_mode_header->trampoline_header保存的是pa_trampoline_header = trampoline_header(在汇编中定义)的地址
trampoline_header = (struct trampoline_header *) __va(real_mode_header->trampoline_header);
// tr_start = secondary_startup_64
trampoline_header->start = (u64) secondary_startup_64;
// real_mode_header->trampoline_pgd保存的是pa_trampoline_pgd = trampoline_pgd(在汇编中定义)的地址
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
// 只拷贝第511项:trampoline_pgd与init_top_pgt共享kernel映射区,并不是整个trampoline_pgd等于init_top_pgt
trampoline_pgd[511] = init_top_pgt[511].pgd;
smp_init ->
cpu_up ->
do_cpu_up ->
_cpu_up ->
cpuhp_up_callbacks ->
cpuhp_invoke_callback ->
cb = bringup ? step->startup.single : step->teardown.single;
ret = cb(cpu); -> // bringup_cpu
__cpu_up ->
smp_ops.cpu_up(cpu, tidle); -> // native_cpu_up
do_boot_cpu ->
// real_mode_header->trampoline_start保存的是pa_trampoline_start = trampoline_start(在汇编中定义)的地址
// start_ip = trampoline_start
unsigned long start_ip = real_mode_header->trampoline_start;
initial_code = (unsigned long)start_secondary; // AP的initial_code为start_secondary
wakeup_cpu_via_init_nmi ->
wakeup_secondary_cpu_via_init ->
// 通过IPI通知AP从start_eip = trampoline_start开始执行
apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid);
try_to_run_init_process("/sbin/init") // /usr/lib/systemd/systemd
run_init_process
do_execve
do_pre_smp_initcalls
在do_pre_smp_initcalls中调用init_real_mode
/*
 * Run every initcall stored in the table range
 * [__initcall_start, __initcall0_start), i.e. the .initcallearly.init
 * entries laid out by INIT_CALLS.
 * NOTE(review): abridged excerpt -- the declaration of `fn` is not shown.
 */
static void __init do_pre_smp_initcalls(void)
{
for (fn = __initcall_start; fn < __initcall0_start; fn++)
do_one_initcall(initcall_from_entry(fn)); // runs the .initcallearly.init entries
}
/* Table boundary symbols; both are assigned in the linker script by INIT_CALLS / INIT_CALLS_LEVEL(0). */
extern initcall_entry_t __initcall_start[];
extern initcall_entry_t __initcall0_start[];
typedef int (*initcall_t)(void); // pointer to an initcall function
/* In this configuration a table entry is the function pointer itself. */
typedef initcall_t initcall_entry_t;
/* Return the initcall function pointer stored in the table entry `entry`. */
static inline initcall_t initcall_from_entry(initcall_entry_t *entry)
{
return *entry;
}
/*
 * Invoke the initcall `fn` and store its return value in ret.
 * NOTE(review): abridged excerpt -- the declaration of `ret` and the
 * return statement are not shown here.
 */
int __init_or_module do_one_initcall(initcall_t fn)
{
ret = fn();
}
/*
 * INIT_CALLS: linker-script layout of the initcall table.
 * __initcall_start marks the beginning; the .initcallearly.init entries
 * come first, followed by levels 0-5, rootfs, 6 and 7 in that order;
 * __initcall_end marks the end.
 */
#define INIT_CALLS \
__initcall_start = .; \
KEEP(*(.initcallearly.init)) \
INIT_CALLS_LEVEL(0) \
INIT_CALLS_LEVEL(1) \
INIT_CALLS_LEVEL(2) \
INIT_CALLS_LEVEL(3) \
INIT_CALLS_LEVEL(4) \
INIT_CALLS_LEVEL(5) \
INIT_CALLS_LEVEL(rootfs) \
INIT_CALLS_LEVEL(6) \
INIT_CALLS_LEVEL(7) \
__initcall_end = .;
/*
 * INIT_CALLS_LEVEL(level): define __initcall<level>_start at the current
 * location, then keep the .initcall<level>.init and .initcall<level>s.init
 * input sections right after it.
 *
 * The last line carries no line continuation, so the macro ends here and
 * does not absorb whatever line follows it.
 */
#define INIT_CALLS_LEVEL(level) \
__initcall##level##_start = .; \
KEEP(*(.initcall##level##.init)) \
KEEP(*(.initcall##level##s.init))
early_initcall(init_real_mode); // registers init_real_mode as an "early" initcall
/* early_initcall(fn) forwards to __define_initcall with id = early. */
#define early_initcall(fn) __define_initcall(fn, early)
/* Builds the section-name prefix .initcall<id> and forwards it. */
#define __define_initcall(fn, id) ___define_initcall(fn, id, .initcall##id)
/* Defines a static initcall_t named __initcall_<fn><id>, placed in section "<__sec>.init" and initialised to fn. */
#define ___define_initcall(fn, id, __sec) \
static initcall_t __initcall_##fn##id __used \
__attribute__((__section__(#__sec ".init"))) = fn;
/* Result of expanding early_initcall(init_real_mode): */
static initcall_t __initcall_init_real_modeearly __used \
__attribute__((__section__(".initcallearly.init"))) = init_real_mode;
real_mode_header
/*
 * real_mode_header as defined in assembly: ten 32-bit words, in the same
 * order as the fields of struct real_mode_header.
 */
GLOBAL(real_mode_header)
.long pa_text_start
.long pa_ro_end
/* SMP trampoline */
.long pa_trampoline_start
.long pa_trampoline_status
.long pa_trampoline_header
.long pa_trampoline_pgd;
/* ACPI S3 wakeup */
.long pa_wakeup_start
.long pa_wakeup_header
/* APM/BIOS reboot */
.long pa_machine_real_restart_asm
.long __KERNEL32_CS
END(real_mode_header)
/*
 * C view of the real_mode_header table defined in assembly. Each u32
 * corresponds, in order, to one .long of that table (the pa_* symbols,
 * with __KERNEL32_CS as the last word).
 */
struct real_mode_header {
u32 text_start;
u32 ro_end;
/* SMP trampoline */
u32 trampoline_start;
u32 trampoline_status;
u32 trampoline_header;
u32 trampoline_pgd;
/* ACPI S3 wakeup */
u32 wakeup_start;
u32 wakeup_header;
/* APM/BIOS reboot */
u32 machine_real_restart_asm;
u32 machine_real_restart_seg;
};
// Declaration of real_mode_header on the C side (a pointer to the struct)
extern struct real_mode_header *real_mode_header;
trampoline_header
/*
 * trampoline_header as defined in assembly; the reserved sizes match the
 * fields of struct trampoline_header in order (8, 8, 4, 4 bytes).
 */
GLOBAL(trampoline_header)
tr_start: .space 8
GLOBAL(tr_efer) .space 8
GLOBAL(tr_cr4) .space 4
GLOBAL(tr_flags) .space 4
END(trampoline_header)
/*
 * C view of the trampoline_header area defined in assembly
 * (tr_start, tr_efer, tr_cr4, tr_flags).
 */
struct trampoline_header {
u64 start; /* tr_start: setup_real_mode stores secondary_startup_64 here */
u64 efer; /* tr_efer */
u32 cr4; /* tr_cr4 */
u32 flags; /* tr_flags */
};
trampoline_pgd
/* trampoline_pgd as defined in assembly: PAGE_SIZE bytes of reserved space. */
GLOBAL(trampoline_pgd) .space PAGE_SIZE
trampoline_start
// 在汇编中定义trampoline_start
ENTRY(trampoline_start)
secondary_startup_64
// 在汇编中定义secondary_startup_64
ENTRY(secondary_startup_64)
// 在C中声明secondary_startup_64
extern unsigned char secondary_startup_64[];
pasyms.h
pasyms.h在编译过程中自动生成:对realmode的各个目标文件运行nm,再用sed把每个符号XXX转换成一行pa_XXX = XXX;
在realmode.lds.S中include pasyms.h
# Objects accumulated in realmode-y; the wakeup objects are appended only
# when CONFIG_ACPI_SLEEP expands to y.
realmode-y += header.o
realmode-y += trampoline_$(BITS).o
realmode-y += stack.o
realmode-y += reboot.o
realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs)
# Every object in realmode-y, prefixed with $(obj)/.
REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y))
# sed script: for each nm output line "<hex address> <type letter> <symbol>"
# whose type letter is one of ABCDGRSTVW, print "pa_<symbol> = <symbol>;".
sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p' // pa_XXX = XXX;
quiet_cmd_pasyms = PASYMS $@
# Run nm over the prerequisites (FORCE filtered out), transform the output
# with sed-pasyms, then sort and de-duplicate it into the target.
cmd_pasyms = $(NM) $(filter-out FORCE,$^) | \
sed $(sed-pasyms) | sort | uniq > $@
# pasyms.h is generated from the real-mode objects.
$(obj)/pasyms.h: $(REALMODE_OBJS) FORCE
$(call if_changed,pasyms)
kthreadd
内核线程,pid=2,负责创建和管理内核线程,是所有内核线程的父线程
ps aux|grep kthreadd
更多推荐


所有评论(0)