内核生命周期
uboot 打印完 Starting kernel . . .
,就完成了自己的使命,控制权便交给了 kernel 的第一条指令,也就是下面这个函数
init/main.c
asmlinkage __visible void __init start_kernel(void)
{
...
rest_init();
}
start_kernel 相当于内核的 main 函数,内核的生命周期就是从执行这个函数的第一条语句开始的,直到最后一个函数 reset_init()
,内核将不再从这个函数中返回,而是陷入这个函数里面的一个 while(1) 死循环,这个死循环被作为 idle 进程,也就是 0 号进程。
所以,内核的生命周期,就是一个完整的 start_kernel 函数。始于 start_kernel 函数的第一条语句,停留在最后的死循环。
init 进程
kernel 会创建众多内核线程,来持续致力于内存、磁盘、CPU 的管理,其中有两个内核线程比较重要,需要我们重点讲解,那就是 1 号内核线程 kernel_init 和 2 号内核线程 kthreadd。1 号内核线程最终会被用户的第一个进程 init 代替,也就成了 1 号进程。如下:
# ps
PID USER COMMAND
1 root init
2 root [kthreadd]
3 root [rcu_gp]
4 root [rcu_par_gp]
7 root [kworker/u4:0-ev]
8 root [mm_percpu_wq]
9 root [ksoftirqd/0]
...
COMMAND 这一列,带中括号的是内核线程,不带中括号的是用户进程。从 PID 统一编址就可以看出,它俩地位是一样的。
下面我们深入分析一下从 start_kernel 到最终运行 init 进程,kernel 都经历了什么
打印
添加打印,是分析流程的好方法。
asmlinkage __visible void __init start_kernel(void)
{
char *command_line;
char *after_dashes;
set_task_stack_end_magic(&init_task);
smp_setup_processor_id();
debug_objects_early_init();
cgroup_init_early();
local_irq_disable();
early_boot_irqs_disabled = true;
/*
* Interrupts are still disabled. Do necessary setups, then
* enable them.
*/
boot_cpu_init();
page_address_init();
pr_notice("%s", linux_banner);
setup_arch(&command_line);
/*
* Set up the the initial canary and entropy after arch
* and after adding latent and command line entropy.
*/
add_latent_entropy();
add_device_randomness(command_line, strlen(command_line));
boot_init_stack_canary();
mm_init_cpumask(&init_mm);
setup_command_line(command_line);
setup_nr_cpu_ids();
setup_per_cpu_areas();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
boot_cpu_hotplug_init();
build_all_zonelists(NULL);
page_alloc_init();
pr_notice("Kernel command line: %s\n", boot_command_line);
parse_early_param();
after_dashes = parse_args("Booting kernel",
static_command_line, __start___param,
__stop___param - __start___param,
-1, -1, NULL, &unknown_bootoption);
if (!IS_ERR_OR_NULL(after_dashes))
parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
NULL, set_init_arg);
jump_label_init();
/*
* These use large bootmem allocations and must precede
* kmem_cache_init()
*/
setup_log_buf(0);
vfs_caches_init_early();
sort_main_extable();
trap_init();
mm_init();
ftrace_init();
/* trace_printk can be enabled here */
early_trace_init();
/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();
/*
* Disable preemption - early bootup scheduling is extremely
* fragile until we cpu_idle() for the first time.
*/
preempt_disable();
if (WARN(!irqs_disabled(),
"Interrupts were enabled *very* early, fixing it\n"))
local_irq_disable();
radix_tree_init();
/*
* Set up housekeeping before setting up workqueues to allow the unbound
* workqueue to take non-housekeeping into account.
*/
housekeeping_init();
/*
* Allow workqueue creation and work item queueing/cancelling
* early. Work item execution depends on kthreads and starts after
* workqueue_init().
*/
workqueue_init_early();
rcu_init();
/* Trace events are available after this */
trace_init();
if (initcall_debug)
initcall_debug_enable();
context_tracking_init();
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
tick_init();
rcu_init_nohz();
init_timers();
hrtimers_init();
softirq_init();
timekeeping_init();
time_init();
sched_clock_postinit();
printk_safe_init();
perf_event_init();
profile_init();
call_function_init();
WARN(!irqs_disabled(), "Interrupts were enabled early\n");
early_boot_irqs_disabled = false;
local_irq_enable();
kmem_cache_init_late();
/*
* HACK ALERT! This is early. We're enabling the console before
* we've done PCI setups etc, and console_init() must be aware of
* this. But we do want output early, in case something goes wrong.
*/
console_init();
printk("## start_kernel() --> console_init()\n");
if (panic_later)
panic("Too many boot %s vars at `%s'", panic_later,
panic_param);
lockdep_info();
/*
* Need to run this when irqs are enabled, because it wants
* to self-test [hard/soft]-irqs on/off lock inversion bugs
* too:
*/
locking_selftest();
/*
* This needs to be called before any devices perform DMA
* operations that might use the SWIOTLB bounce buffers. It will
* mark the bounce buffers as decrypted so that their usage will
* not cause "plain-text" data to be decrypted when accessed.
*/
mem_encrypt_init();
#ifdef CONFIG_BLK_DEV_INITRD
if (initrd_start && !initrd_below_start_ok &&
page_to_pfn(virt_to_page((void _)initrd_start)) < min_low_pfn) {
pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
page_to_pfn(virt_to_page((void _)initrd_start)),
min_low_pfn);
initrd_start = 0;
}
#endif
page_ext_init();
kmemleak_init();
debug_objects_mem_init();
setup_per_cpu_pageset();
numa_policy_init();
acpi_early_init();
if (late_time_init)
late_time_init();
calibrate_delay();
pid_idr_init();
anon_vma_init();
#ifdef CONFIG_X86
if (efi_enabled(EFI_RUNTIME_SERVICES))
efi_enter_virtual_mode();
#endif
thread_stac