多处理器系统

Linux内核在单处理器和多处理器系统上的启动过程有显著区别:单处理器系统只有一个CPU,整个启动过程都在这个CPU上完成;多处理器系统有多个CPU,Intel将这些CPU划分为两类:BSP(BootStrap Processor,主处理器,只有一个)和AP(Application Processor,从处理器,可以有多个)。内核先在BSP上启动,再由BSP通过IPI(处理器间中断)通知AP启动;由于BSP已经完成了大部分启动工作,AP不需要从头开始,只需要执行少量启动工作。

计算机引导过程四(vmlinuz保护模式vmlinux.bin)中提到解压后内核起始地址是arch/x86/kernel/head_64.S中的startup_64,在startup_64中有段代码体现了两者的差异

	/* Form the CR3 value being sure to include the CR3 modifier */
	addq	$(early_top_pgt - __START_KERNEL_map), %rax // rax = early_top_pgt物理编译地址
	jmp 1f                  // BSP:跳转到1f,startup_64 -> initial_code(对于BSP,是x86_64_start_kernel的地址)
ENTRY(secondary_startup_64) // AP:secondary_startup_64 -> initial_code(对于AP,是start_secondary的地址)
	...
	/*
	 * Retrieve the modifier (SME encryption mask if SME is active) to be
	 * added to the initial pgdir entry that will be programmed into CR3.
	 */
	pushq	%rsi
	call	__startup_secondary_64 // 调用__startup_secondary_64获取加密掩码,返回值rax = 加密掩码(一般为0)
	popq	%rsi

	/* Form the CR3 value being sure to include the CR3 modifier */
	addq	$(init_top_pgt - __START_KERNEL_map), %rax // rax = init_top_pgt物理编译地址
1:

可以看到,BSP执行完secondary_startup_64之前的部分后直接跳转到1f,跳过了从secondary_startup_64到1f之间的部分;而AP直接从secondary_startup_64开始执行。在这段代码中,两者的差别在于用来构造CR3值的页表不同(BSP使用early_top_pgt,AP使用init_top_pgt);从1f开始,两者执行的代码完全一样

BSP调用栈

BSP调用栈比较清楚,startup_64 -> x86_64_start_kernel

// arch/x86/kernel/head_64.S
startup_64 ->
	initial_code -> // x86_64_start_kernel
		reset_early_page_tables(); // 清空early_top_pgt,保留kernel映射区
		clear_page(init_top_pgt); // 清空init_top_pgt
		init_top_pgt[511] = early_top_pgt[511]; // 保留kernel映射区
		x86_64_start_reservations ->
			start_kernel

AP调用栈

AP调用栈有点复杂,是在kernel_init中确定的,这里我们先列出来,等到kernel_init再详细解释

// arch/x86/realmode/rm/trampoline_64.S
trampoline_start ->
	startup_32 ->
		movl	$pa_trampoline_pgd, %eax
		movl	%eax, %cr3 // cr3 = pa_trampoline_pgd,即trampoline_pgd的物理地址;trampoline_pgd只有第511项复制自init_top_pgt(共享kernel映射区),并不等同于init_top_pgt
		startup_64 ->
			jmpq	*tr_start(%rip) // 跳转到tr_start = secondary_startup_64

// arch/x86/kernel/head_64.S
secondary_startup_64 ->
	initial_code -> // start_secondary
		cpu_startup_entry ->
			while (1)
				do_idle();

start_kernel

内核进程,pid=0,一般称为0号/idle/swapper进程,是systemd和kthreadd的父进程

start_kernel ->
	setup_arch ->
		early_alloc_pgt_buf

		e820__memblock_setup -> // 将e820_table转换为memblock
			memblock_add // 将一块物理内存区域加入到memblock.memory

		init_mem_mapping ->
			init_memory_mapping(0, ISA_END_ADDRESS); // 创建1MB以下的direct mapping映射区
			memory_map_top_down(ISA_END_ADDRESS, end); -> // 创建1MB以上的direct mapping映射区
				init_range_memory_mapping ->
					init_memory_mapping
			load_cr3(swapper_pg_dir); // #define swapper_pg_dir init_top_pgt
			__flush_tlb_all(); // 刷新tlb

		x86_init.paging.pagetable_init(); -> // native_pagetable_init
			paging_init ->	
				sparse_init // SPARSEMEM模型初始化

	arch_call_rest_init ->
		rest_init ->
			// 创建内核线程kernel_init
			pid = kernel_thread(kernel_init, NULL, CLONE_FS);
			// 创建内核线程kthreadd
			pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
			cpu_startup_entry ->
				while (1)
					do_idle();

early_alloc_pgt_buf

/*
 * Carve the early page-table buffer out of the brk area.
 *
 * Requests INIT_PGT_BUF_SIZE bytes, aligned to PAGE_SIZE, from extend_brk()
 * and records the resulting region as page frame numbers in pgt_buf_start,
 * pgt_buf_end and pgt_buf_top.
 */
void  __init early_alloc_pgt_buf(void)
{
	unsigned long tables = INIT_PGT_BUF_SIZE;
	phys_addr_t base;

	/* extend_brk() hands back a pointer; __pa() turns it into a physical address. */
	base = __pa(extend_brk(tables, PAGE_SIZE));

	/*
	 * Convert to page frame numbers. pgt_buf_end starts out equal to
	 * pgt_buf_start; pgt_buf_top is start plus the buffer size in pages.
	 */
	pgt_buf_start = base >> PAGE_SHIFT;
	pgt_buf_end = pgt_buf_start;
	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
}
/*
 * Allocate `size` bytes from the brk area, aligned to `align`.
 *
 * Triggers BUG_ON if _brk_start is 0, if `align` fails the power-of-two
 * check (align & (align - 1)), or if the request would extend past
 * __brk_limit. On success _brk_end is advanced past the new region and a
 * pointer to the zero-filled region is returned.
 */
void * __init extend_brk(size_t size, size_t align)
{
	size_t mask = align - 1;
	void *ret;

	BUG_ON(_brk_start == 0);
	BUG_ON(align & mask);

	/* Round the current end of the brk area up to the requested alignment. */
	_brk_end = (_brk_end + mask) & ~mask;
	BUG_ON((char *)(_brk_end + size) > __brk_limit);

	ret = (void *)_brk_end;	// declared elsewhere as: unsigned long _brk_end = (unsigned long)__brk_base;
	_brk_end += size;		// consume `size` bytes of the brk area

	memset(ret, 0, size);

	return ret;
}

调用RESERVE_BRK在.brk_reservation中保留大小为INIT_PGT_BUF_SIZE的空间

RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);

RESERVE_BRK定义了一个放在.discard.text中的函数,这个函数本身并不会被执行,它只是内联汇编的载体:其中的汇编伪指令(.pushsection/.skip/.popsection)在汇编阶段就在.brk_reservation中保留了大小为sz的空间

/*
 * RESERVE_BRK(name, sz): reserve sz bytes in the .brk_reservation section.
 *
 * Expands to a function placed in .discard.text whose body is a single asm
 * block. The asm switches to .brk_reservation ("aw", @nobits), defines the
 * symbol .brk.<name>, skips sz bytes, records the symbol size, and switches
 * back to the previous section. sz is passed with the "i" constraint, so it
 * has to be a compile-time constant.
 */
#define RESERVE_BRK(name,sz)						\
	static void __section(.discard.text) __used notrace		\
	__brk_reservation_fn_##name##__(void) {				\
		asm volatile (						\
			".pushsection .brk_reservation,\"aw\",@nobits;" \
			".brk." #name ":"				\
			" 1:.skip %c0;"					\
			" .size .brk." #name ", . - 1b;"		\
			" .popsection"					\
			: : "i" (sz));					\
	}

.brk_reservation位于.brk

// arch/x86/kernel/vmlinux.lds.S
. = ALIGN(PAGE_SIZE);
.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
	__brk_base = .;
	. += 64 * 1024;		/* 64k alignment slop space */ // 在__brk_base后预留64KB空间
	*(.brk_reservation)	/* areas brk users have reserved */ // 各RESERVE_BRK保留的.brk_reservation紧跟在这64KB之后(而不是放入这64KB之内)
	__brk_limit = .;
}

kernel_init

内核线程,pid=1;完成剩余的初始化后通过do_execve加载/sbin/init(即systemd),自身转变为用户进程systemd,而不是启动systemd后退出
systemd是第一个用户进程,pid=1,是所有用户进程的祖先进程
ps aux|grep systemd

kernel_init ->
	kernel_init_freeable ->
		do_pre_smp_initcalls ->
			init_real_mode ->
				setup_real_mode ->
					// real_mode_header->trampoline_header保存的是pa_trampoline_header = trampoline_header(在汇编中定义)的地址
					trampoline_header = (struct trampoline_header *) __va(real_mode_header->trampoline_header);
					// tr_start = secondary_startup_64
					trampoline_header->start = (u64) secondary_startup_64;
					// real_mode_header->trampoline_pgd保存的是pa_trampoline_pgd = trampoline_pgd(在汇编中定义)的地址
					trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
					// 只复制第511项,使trampoline_pgd与init_top_pgt共享kernel映射区(并非trampoline_pgd = init_top_pgt)
					trampoline_pgd[511] = init_top_pgt[511].pgd;
		smp_init ->
			cpu_up ->
				do_cpu_up ->
					_cpu_up ->
						cpuhp_up_callbacks ->
							cpuhp_invoke_callback ->
								cb = bringup ? step->startup.single : step->teardown.single;
								ret = cb(cpu); -> // bringup_cpu
									__cpu_up ->
										smp_ops.cpu_up(cpu, tidle); -> // native_cpu_up
											do_boot_cpu ->
												// real_mode_header->trampoline_start保存的是pa_trampoline_start = trampoline_start(在汇编中定义)的地址
												// start_ip = trampoline_start
												unsigned long start_ip = real_mode_header->trampoline_start;
												initial_code = (unsigned long)start_secondary; // AP的initial_code为start_secondary
												wakeup_cpu_via_init_nmi ->
													wakeup_secondary_cpu_via_init ->
														// 通过IPI通知AP从start_eip = trampoline_start开始执行
														apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid);
	try_to_run_init_process("/sbin/init") // /usr/lib/systemd/systemd
		run_init_process
			do_execve

do_pre_smp_initcalls

在do_pre_smp_initcalls中调用init_real_mode

/*
 * Run every initcall entry in the range [__initcall_start, __initcall0_start).
 * NOTE(review): the declaration of `fn` is not shown in this excerpt.
 */
static void __init do_pre_smp_initcalls(void)
{
	for (fn = __initcall_start; fn < __initcall0_start; fn++)
		do_one_initcall(initcall_from_entry(fn)); // runs the entries collected in .initcallearly.init
}

extern initcall_entry_t __initcall_start[];
extern initcall_entry_t __initcall0_start[];

typedef int (*initcall_t)(void); // 函数指针

typedef initcall_t initcall_entry_t;

/* Return the initcall function pointer stored in the table slot `entry`. */
static inline initcall_t initcall_from_entry(initcall_entry_t *entry)
{
	return *entry;
}

/*
 * Invoke the initcall `fn` and store its result in `ret`.
 * NOTE(review): abbreviated excerpt; the declaration of `ret` and the
 * return statement are not shown here.
 */
int __init_or_module do_one_initcall(initcall_t fn)
{
	ret = fn();
}
#define INIT_CALLS							\
		__initcall_start = .;					\
		KEEP(*(.initcallearly.init))				\
		INIT_CALLS_LEVEL(0)					\
		INIT_CALLS_LEVEL(1)					\
		INIT_CALLS_LEVEL(2)					\
		INIT_CALLS_LEVEL(3)					\
		INIT_CALLS_LEVEL(4)					\
		INIT_CALLS_LEVEL(5)					\
		INIT_CALLS_LEVEL(rootfs)				\
		INIT_CALLS_LEVEL(6)					\
		INIT_CALLS_LEVEL(7)					\
		__initcall_end = .;

#define INIT_CALLS_LEVEL(level)						\
		__initcall##level##_start = .;				\
		KEEP(*(.initcall##level##.init))			\
		KEEP(*(.initcall##level##s.init))			\

// arch/x86/realmode/init.c
early_initcall(init_real_mode);

#define early_initcall(fn)		__define_initcall(fn, early)

#define __define_initcall(fn, id) ___define_initcall(fn, id, .initcall##id)

#define ___define_initcall(fn, id, __sec) \
	static initcall_t __initcall_##fn##id __used \
		__attribute__((__section__(#__sec ".init"))) = fn;

static initcall_t __initcall_init_real_modeearly __used \
		__attribute__((__section__(".initcallearly.init"))) = init_real_mode;

real_mode_header

// 在汇编中定义real_mode_header
GLOBAL(real_mode_header)
	.long	pa_text_start
	.long	pa_ro_end
	/* SMP trampoline */
	.long	pa_trampoline_start
	.long	pa_trampoline_status
	.long	pa_trampoline_header
	.long	pa_trampoline_pgd;
	/* ACPI S3 wakeup */
	.long	pa_wakeup_start
	.long	pa_wakeup_header
	/* APM/BIOS reboot */
	.long	pa_machine_real_restart_asm
	.long	__KERNEL32_CS
END(real_mode_header)

/*
 * C view of the real_mode_header table defined in assembly: ten u32 fields,
 * in the same order as the ten .long entries of GLOBAL(real_mode_header).
 * Each field corresponds to the pa_* symbol emitted at the same position.
 */
struct real_mode_header {
	u32	text_start;
	u32	ro_end;
	/* SMP trampoline */
	u32	trampoline_start;
	u32	trampoline_status;
	u32	trampoline_header;
	u32	trampoline_pgd;
	/* ACPI S3 wakeup */
	u32	wakeup_start;
	u32	wakeup_header;
	/* APM/BIOS reboot */
	u32	machine_real_restart_asm;
	u32	machine_real_restart_seg;
};

// 在C中声明real_mode_header
extern struct real_mode_header *real_mode_header;

trampoline_header

// 在汇编中定义trampoline_header
GLOBAL(trampoline_header)
	tr_start:		.space	8
	GLOBAL(tr_efer)		.space	8
	GLOBAL(tr_cr4)		.space	4
	GLOBAL(tr_flags)	.space	4
END(trampoline_header)

/*
 * C view of the trampoline_header area defined in assembly. Field sizes
 * follow the .space directives there: tr_start (8 bytes), tr_efer (8),
 * tr_cr4 (4) and tr_flags (4), in that order.
 */
struct trampoline_header {
	u64 start;
	u64 efer;
	u32 cr4;
	u32 flags;
};

trampoline_pgd

// 在汇编中定义trampoline_pgd
GLOBAL(trampoline_pgd)		.space	PAGE_SIZE

trampoline_start

// 在汇编中定义trampoline_start
ENTRY(trampoline_start)

secondary_startup_64

// 在汇编中定义secondary_startup_64
ENTRY(secondary_startup_64)

// 在C中声明secondary_startup_64
extern unsigned char secondary_startup_64[];

pasyms.h

pasyms.h在编译过程中生成:用nm列出realmode各目标文件的符号,再用sed把每个符号XXX转换成一行 pa_XXX = XXX;
realmode.lds.S中include了pasyms.h

realmode-y			+= header.o
realmode-y			+= trampoline_$(BITS).o
realmode-y			+= stack.o
realmode-y			+= reboot.o
realmode-$(CONFIG_ACPI_SLEEP)	+= $(wakeup-objs)

REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y))

sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p' // pa_XXX = XXX;

quiet_cmd_pasyms = PASYMS  $@
      cmd_pasyms = $(NM) $(filter-out FORCE,$^) | \
		   sed $(sed-pasyms) | sort | uniq > $@

$(obj)/pasyms.h: $(REALMODE_OBJS) FORCE
	$(call if_changed,pasyms)

kthreadd

内核线程,pid=2,负责创建和管理内核线程,是所有内核线程的父线程
ps aux|grep kthreadd

Logo

有“AI”的1024 = 2048,欢迎大家加入2048 AI社区

更多推荐