7. linux/init/main.c

I felt guilty writing this chapter, as there are already plenty of documents about it, if not more than enough. The functions that support start_kern() — that is, the functions start_kernel() calls — change from version to version, because they depend on the internals of OS components that are being improved all the time. I may not have the time for frequent document updates, so I decided to keep this chapter as simple as possible.

7.1. start_kernel()

///////////////////////////////////////////////////////////////////////////////
/*
 * start_kernel() - architecture-independent kernel entry point.
 *
 * Annotated listing, not compilable C: the brace blocks that follow
 * trap_init(), softirq_init(), smp_init() and rest_init() are inline
 * sketches of what those callees do, shown here for reading convenience.
 *
 * Runs with interrupts disabled until the sti() call below, and ends in
 * rest_init(), which never returns.
 */
asmlinkage void __init start_kernel(void)
{
        char * command_line;
        extern char saved_command_line[];
/*
 * Interrupts are still disabled. Do necessary setups, then enable them
 */
        lock_kernel();
        printk(linux_banner);

        /* Memory Management in Linux, esp. for setup_arch()
         * Linux-2.4.4 MM Initialization */
        setup_arch(&command_line);
        printk("Kernel command line: %s\n", saved_command_line);

        /* linux/Documentation/kernel-parameters.txt
         * The Linux BootPrompt-HowTo */
        parse_options(command_line);

        trap_init() {
#ifdef CONFIG_EISA
                /* look for the "EISA" signature (little-endian dword) */
                if (isa_readl(0x0FFFD9) == 'E'+('I'<<8)+('S'<<16)+('A'<<24))
                        EISA_bus = 1;
#endif
#ifdef CONFIG_X86_LOCAL_APIC
                init_apic_mappings();
#endif
                set_xxxx_gate(x, &func);    // setup gates
                cpu_init();
        }
        init_IRQ();
        sched_init();
        softirq_init() {
                /* one tasklet per bottom-half slot */
                for (int i=0; i<32; i++)
                        tasklet_init(bh_task_vec+i, bh_action, i);
                open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
                open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
        }
        time_init();

        /*
         * HACK ALERT! This is early. We're enabling the console before
         * we've done PCI setups etc, and console_init() must be aware of
         * this. But we do want output early, in case something goes wrong.
         */
        console_init();
#ifdef CONFIG_MODULES
        init_modules();
#endif
        if (prof_shift) {
                unsigned int size;
                /* only text is profiled */
                prof_len = (unsigned long) &_etext - (unsigned long) &_stext;
                prof_len >>= prof_shift;
                /* one counter per profiled slot, plus PAGE_SIZE-1 bytes of slack */
                size = prof_len * sizeof(unsigned int) + PAGE_SIZE-1;
                prof_buffer = (unsigned int *) alloc_bootmem(size);
        }

        kmem_cache_init();
        /* interrupts are enabled from this point on */
        sti();

        // BogoMips mini-Howto
        calibrate_delay();

        // linux/Documentation/initrd.txt
#ifdef CONFIG_BLK_DEV_INITRD
        if (initrd_start && !initrd_below_start_ok &&
                        initrd_start < min_low_pfn << PAGE_SHIFT) {
                printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
                    "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT);
                initrd_start = 0;
        }
#endif

        mem_init();
        kmem_cache_sizes_init();
        pgtable_cache_init();

        /*
         * For architectures that have highmem, num_mappedpages represents
         * the amount of memory the kernel can use.  For other architectures
         * it's the same as the total pages.  We need both numbers because
         * some subsystems need to initialize based on how much memory the
         * kernel can use.
         */
        if (num_mappedpages == 0)
                num_mappedpages =  num_physpages;

        /* sized from the kernel-usable page count computed just above */
        fork_init(num_mappedpages);
        proc_caches_init();
        vfs_caches_init(num_physpages);
        buffer_init(num_physpages);
        page_cache_init(num_physpages);
#if defined(CONFIG_ARCH_S390)
        ccwcache_init();
#endif
        signals_init();
#ifdef CONFIG_PROC_FS
        proc_root_init();
#endif
#if defined(CONFIG_SYSVIPC)
        ipc_init();
#endif
        check_bugs();
        printk("POSIX conformance testing by UNIFIX\n");

        /*
         *      We count on the initial thread going ok
         *      Like idlers init is an unlocked kernel thread, which will
         *      make syscalls (and thus be locked).
         */
        smp_init() {
#ifndef CONFIG_SMP
#     ifdef CONFIG_X86_LOCAL_APIC
                APIC_init_uniprocessor();
#     else
                do { } while (0);
#     endif
#else
                /* Check Section 8.2. */
#endif
        }

        rest_init() {
                // init process, pid = 1
                kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
                unlock_kernel();
                current->need_resched = 1;
                // idle process, pid = 0
                cpu_idle();     // never return
        }
}
start_kernel() calls rest_init() to spawn the "init" process, and then becomes the "idle" process itself.

7.2. init()

"Init" process:
///////////////////////////////////////////////////////////////////////////////
/*
 * Body of the "init" kernel thread.
 *
 * Finishes kernel-side setup under the big kernel lock, releases the
 * init memory, opens the console as descriptors 0, 1 and 2, and then
 * tries to exec a user-mode init program.  Panics if none can be
 * executed.
 */
static int init(void * unused)
{
        /* Fallback programs, tried in this order after any init= command. */
        static char * const fallback_init[] = {
                "/sbin/init",
                "/etc/init",
                "/bin/init",
                "/bin/sh",
        };
        int console_fd;
        unsigned int idx;

        lock_kernel();
        do_basic_setup();
        prepare_namespace();

        /*
         * Initial bootup is complete at this point: drop the initmem
         * segments before moving on to the user-mode part.
         */
        free_initmem();
        unlock_kernel();

        /* stdin */
        console_fd = open("/dev/console", O_RDWR, 0);
        if (console_fd < 0)
                printk("Warning: unable to open an initial console.\n");

        (void) dup(0);          /* stdout */
        (void) dup(0);          /* stderr */

        /*
         * Each candidate is tried in turn until one succeeds.  The
         * Bourne shell comes last so that a really broken machine can
         * still be recovered.
         */
        if (execute_command)
                execve(execute_command,argv_init,envp_init);

        for (idx = 0; idx < sizeof(fallback_init) / sizeof(fallback_init[0]); idx++)
                execve(fallback_init[idx],argv_init,envp_init);

        panic("No init found.  Try passing init= option to kernel.");
}
Refer to "man init" or SysVinit for further information on the user-mode "init" process.

7.3. cpu_idle()

"Idle" process:
/*
 * Body of the idle thread.  It has nothing useful to do, so it tries
 * to save power while keeping exit latency low: it sits in a loop
 * until somebody asks for a reschedule.
 */
void cpu_idle (void)
{
        void (*idle_fn)(void);

        /* set up the idle task, then give it no priority at all */
        init_idle();
        current->nice = 20;
        current->counter = -100;

        for (;;) {
                /* sample pm_idle once per pass; fall back to the default */
                idle_fn = pm_idle;
                if (!idle_fn)
                        idle_fn = default_idle;

                /* idle until a reschedule is requested */
                for (;;) {
                        if (current->need_resched)
                                break;
                        idle_fn();
                }

                schedule();
                check_pgt_cache();
        }
}

///////////////////////////////////////////////////////////////////////////////
/*
 * Register the current task as the idle task of the CPU it runs on:
 * take it off the runqueue if it is there (unless it is init_task),
 * record it in this CPU's schedule data, and clear this CPU's bit in
 * wait_init_idle.
 */
void __init init_idle(void)
{
        int this_cpu = smp_processor_id();
        struct schedule_data *sd = &aligned_data[this_cpu].schedule_data;

        if (current != &init_task) {
                if (task_on_runqueue(current)) {
                        printk("UGH! (%d:%d) was on the runqueue, removing.\n",
                                this_cpu, current->pid);
                        del_from_runqueue(current);
                }
        }

        sd->curr = current;
        sd->last_schedule = get_cycles();
        clear_bit(current->processor, &wait_init_idle);
}

///////////////////////////////////////////////////////////////////////////////
/*
 * Default idle routine: halt the CPU when "hlt" is known to work and
 * hlt_counter is zero.  need_resched is re-checked after __cli() so
 * that the CPU only halts when no reschedule is pending.
 */
void default_idle(void)
{
        if (!current_cpu_data.hlt_works_ok || hlt_counter)
                return;

        __cli();
        if (current->need_resched) {
                /* a reschedule is pending: just undo the __cli() */
                __sti();
                return;
        }
        safe_halt();
}

/* defined in linux/include/asm-i386/system.h */
/*
 * __cli() and __sti() each emit a single instruction ("cli" / "sti").
 * The "memory" clobber makes each one a compiler-level memory barrier.
 */
#define __cli()                 __asm__ __volatile__("cli": : :"memory")
#define __sti()                 __asm__ __volatile__("sti": : :"memory")

/* used in the idle loop; sti takes one instruction cycle to complete */
/*
 * NOTE(review): presumably the point of issuing "sti; hlt" back to back
 * is that no interrupt can slip in between the two instructions --
 * confirm against the Intel STI instruction reference.
 */
#define safe_halt()             __asm__ __volatile__("sti; hlt": : :"memory")
The CPU will resume execution at the instruction following "hlt" upon return from an interrupt handler.

7.4. Reference