Multicore Scheduling
Basic scheduling APIs
Ready-queue initialization
This is simple enough that I'll just show the source:
int rr_sched_init(void)
{
int i = 0;
/* Initialize global variables */
for (i = 0; i < PLAT_CPU_NUM; i++) {
init_list_head(&(rr_ready_queue_meta[i].queue_head));
lock_init(&(rr_ready_queue_meta[i].queue_lock));
rr_ready_queue_meta[i].queue_len = 0;
}
return 0;
}
After kernel initialization finishes, create_root_thread is called to create the first user-space process and its thread. At the end of create_root_thread, sched_enqueue is called to put the newly created thread onto a ready queue. sched_enqueue eventually calls rr_sched_enqueue, defined in kernel/sched/policy_rr.c. That function first picks an appropriate CPU core's ready queue (taking into account whether the thread is pinned to a core, as well as load balancing across cores), then calls __rr_sched_enqueue to insert the thread into the chosen queue.
Under the RR policy, picking the ready queue is a simple algorithm: if the current core's queue length has not reached the load-balancing threshold, use the current CPU; once it has, pick the CPU with the fewest queued threads, then bump that queue's length. The threshold is tunable, but this is still a very simple policy, and each CPU's queue metadata is guarded by one big lock ("one big lock keeps it safe"), with no further optimization. A sketch of this selection step follows.
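Below is a minimal sketch of the enqueue wrapper and the CPU-selection step just described, written against the names used in the listings on this page. The helper rr_sched_choose_cpu, the affinity field, NO_AFF, and LOADBALANCE_THRESHOLD are assumptions for illustration; the real ChCore code may spell these differently.
/* Hypothetical helper: pick a ready queue for @thread. */
static int rr_sched_choose_cpu(struct thread *thread)
{
        unsigned int cpuid = smp_get_cpu_id();
        unsigned int i, min_cpu;

        /* A thread pinned to a core always goes to that core
         * (the affinity field and NO_AFF are assumed names). */
        if (thread->thread_ctx->affinity != NO_AFF)
                return thread->thread_ctx->affinity;

        /* Below the load-balancing threshold: stay on the local core. */
        if (rr_ready_queue_meta[cpuid].queue_len <= LOADBALANCE_THRESHOLD)
                return cpuid;

        /* Otherwise pick the core with the shortest ready queue. */
        min_cpu = 0;
        for (i = 1; i < PLAT_CPU_NUM; i++) {
                if (rr_ready_queue_meta[i].queue_len
                    < rr_ready_queue_meta[min_cpu].queue_len)
                        min_cpu = i;
        }
        return min_cpu;
}

/* Sketch of the wrapper: select a queue, then insert under its lock
 * (error checks omitted). */
int rr_sched_enqueue(struct thread *thread)
{
        int cpuid, ret;

        cpuid = rr_sched_choose_cpu(thread);
        lock(&(rr_ready_queue_meta[cpuid].queue_lock));
        ret = __rr_sched_enqueue(thread, cpuid);
        unlock(&(rr_ready_queue_meta[cpuid].queue_lock));
        return ret;
}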
And because this is RR, priority plays no role here: after a few simple checks the thread is just appended to the list.
int __rr_sched_enqueue(struct thread *thread, int cpuid)
{
if (thread->thread_ctx->type == TYPE_IDLE) {
return 0;
}
/* Already in the ready queue */
if (thread_is_ts_ready(thread)) {
return -EINVAL;
}
thread->thread_ctx->cpuid = cpuid;
thread_set_ts_ready(thread);
obj_ref(thread);
list_append(&(thread->ready_queue_node),
&(rr_ready_queue_meta[cpuid].queue_head));
rr_ready_queue_meta[cpuid].queue_len++;
return 0;
}
Likewise, dequeueing is a very simple API (the inner __rr_sched_dequeue is sketched after this listing):
/*
* remove @thread from its current residual ready queue
*/
int rr_sched_dequeue(struct thread *thread)
{
BUG_ON(!thread);
BUG_ON(!thread->thread_ctx);
/* IDLE thread will **not** be in any ready queue */
BUG_ON(thread->thread_ctx->type == TYPE_IDLE);
unsigned int cpuid = 0;
int ret = 0;
cpuid = thread->thread_ctx->cpuid;
lock(&(rr_ready_queue_meta[cpuid].queue_lock));
ret = __rr_sched_dequeue(thread);
unlock(&(rr_ready_queue_meta[cpuid].queue_lock));
return ret;
}
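__rr_sched_dequeue itself is not shown above; here is a reconstruction that simply undoes what __rr_sched_enqueue did. The intermediate-state setter thread_set_ts_inter is an assumption, as is the exact error handling:
/* Sketch: remove @thread from its ready queue. Must be called with
 * the queue lock held, mirroring rr_sched_dequeue above. */
int __rr_sched_dequeue(struct thread *thread)
{
        /* Not on any ready queue. */
        if (!thread_is_ts_ready(thread))
                return -EINVAL;

        list_del(&(thread->ready_queue_node));
        rr_ready_queue_meta[thread->thread_ctx->cpuid].queue_len--;
        thread_set_ts_inter(thread); /* assumed intermediate state */
        obj_put(thread);             /* drop the ref taken at enqueue */
        return 0;
}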
Choosing a runnable thread is a bit more involved, because it includes a fast-path optimization to keep lock overhead down.
Concretely, before picking a thread we first check for emptiness without holding the lock: if the ready queue is empty, the idle thread is returned right away.
Only if it is non-empty do we take the lock, which reduces lock contention under light load.
With the lock held, we walk this core's ready queue looking for a thread whose kernel stack is free, or that is the current thread; if none is found, we return idle.
(Per the comment in the code, this relates to scheduling threads across cores: load balancing may hand a thread to another CPU while its kernel context, such as its kernel stack, still needs time to be migrated or released, e.g. it is mid-syscall or mid-interrupt on the original core, so the ready queue may momentarily contain no thread whose stack is free. The emptiness check is also repeated under the lock because the queue can drain between the unlocked fast-path check and lock acquisition.)
/*
* Choose an appropriate thread and dequeue from ready queue
*/
struct thread *rr_sched_choose_thread(void)
{
unsigned int cpuid = smp_get_cpu_id();
struct thread *thread = NULL;
if (!list_empty(&(rr_ready_queue_meta[cpuid].queue_head))) {
lock(&(rr_ready_queue_meta[cpuid].queue_lock));
again:
if (list_empty(&(rr_ready_queue_meta[cpuid].queue_head))) {
unlock(&(rr_ready_queue_meta[cpuid].queue_lock));
goto out;
}
/*
* When the thread is just moved from another cpu and
* the kernel stack is used by the original core, try
* to find another thread.
*/
if (!(thread = find_runnable_thread(
&(rr_ready_queue_meta[cpuid].queue_head)))) {
unlock(&(rr_ready_queue_meta[cpuid].queue_lock));
goto out;
}
BUG_ON(__rr_sched_dequeue(thread));
if (thread_is_exiting(thread) || thread_is_exited(thread)) {
/* Thread need to exit. Set the state to TE_EXITED */
thread_set_exited(thread);
goto again;
}
unlock(&(rr_ready_queue_meta[cpuid].queue_lock));
return thread;
}
out:
return &idle_threads[cpuid];
}
/*
 * find_runnable_thread
 * ** Should hold a dedicated lock for the thread_list, and this function can
 * only be called in the critical section!! ** Only a thread whose kernel
 * stack state is free can be chosen
 */
struct thread *find_runnable_thread(struct list_head *thread_list)
{
struct thread *thread;
for_each_in_list (
thread, struct thread, ready_queue_node, thread_list) {
if (!thread_is_suspend(thread)
&& (thread->thread_ctx->kernel_stack_state == KS_FREE
|| thread == current_thread)) {
return thread;
}
}
return NULL;
}
As for rr_sched itself, the logic is as follows:
- If there is no old thread, just choose one.
- If there is an old thread, look at its type: some IPC-related thread types need to pass their scheduling context on to the newly scheduled thread.
- Check whether the thread's own time slice (sc->budget, where sc is the scheduler context) is exhausted; if not, keep running it.
- If it is exhausted, switch to a new thread.
The switch itself is simple, since at this point we have not yet returned to user space: it just updates some metadata around current_thread and sets prev_thread on the new thread; a sketch follows.
(An example that does return to user space is sched_to_thread, which is more involved internally: it may schedule across cores, and then has to use an IPI, an architecture-specific inter-processor interrupt, to wait for the target CPU core to finish handling its pending interrupts and release the kernel stack, before the architecture-specific context switch, register save/load, and the return to user space.)
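A minimal sketch of what switch_to_thread amounts to, based only on the description above (the state setter thread_set_ts_running is an assumed name):
/* Sketch: bookkeeping-only switch; no user-space return happens here. */
static void switch_to_thread(struct thread *target)
{
        BUG_ON(!target);
        BUG_ON(!target->thread_ctx);

        /* Record which core the thread runs on and mark it running. */
        target->thread_ctx->cpuid = smp_get_cpu_id();
        thread_set_ts_running(target); /* assumed state setter */

        /* Remember who ran before; switch_context (shown later) skips
         * the save/restore when prev_thread is THREAD_ITSELF. */
        target->prev_thread =
                (target == current_thread) ? THREAD_ITSELF : current_thread;
        current_thread = target;
}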
/*
* Schedule a thread to execute.
* current_thread can be NULL, or the state is TS_RUNNING or
* TS_WAITING/TS_BLOCKING. This function will suspend current running thread, if
* any, and schedule another thread from
* `(rr_ready_queue_meta[cpuid].queue_head)`.
* ***the following text might be outdated***
* 1. Choose an appropriate thread through calling *chooseThread* (Simple
* Priority-Based Policy)
* 2. Update the current running thread and leave the caller to restore the executing
* context
*/
int rr_sched(void)
{
/* WITH IRQ Disabled */
struct thread *old = current_thread;
struct thread *new = 0;
if (old) {
BUG_ON(!old->thread_ctx);
/* old thread may pass its scheduling context to others. */
if (old->thread_ctx->type != TYPE_SHADOW
&& old->thread_ctx->type != TYPE_REGISTER) {
BUG_ON(!old->thread_ctx->sc);
}
/* Setting TE_EXITING after the check won't cause any trouble; the
 * thread will be recycled afterwards. Just a fast path. */
/* Check whether the thread is going to exit */
if (thread_is_exiting(old)) {
/* Set the state to TE_EXITED */
thread_set_exited(old);
}
/* check old state */
if (!thread_is_exited(old)) {
if (thread_is_ts_running(old)) {
/* A thread without SC should not be TS_RUNNING.
*/
BUG_ON(!old->thread_ctx->sc);
if (old->thread_ctx->sc->budget != 0
&& !thread_is_suspend(old)) {
switch_to_thread(old);
return 0; /* no schedule needed */
}
rr_sched_refill_budget(old, DEFAULT_BUDGET);
BUG_ON(rr_sched_enqueue(old) != 0);
} else if (!thread_is_ts_blocking(old)
&& !thread_is_ts_waiting(old)) {
kinfo("thread state: %d\n",
old->thread_ctx->state);
BUG_ON(1);
}
}
}
BUG_ON(!(new = rr_sched_choose_thread()));
switch_to_thread(new);
return 0;
}
Cooperative Scheduling
With the APIs above, this is very easy!
Implementing cooperative scheduling only requires letting user space voluntarily give up the CPU, i.e. a yield syscall.
In the kernel we simply clear the thread's time slice, switch threads with sched (a "virtual function" of the policy "base class", "overridden" here to rr_sched), and eret back to user space.
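The "virtual function" remark refers to a table of function pointers that each policy fills in. A sketch of what that might look like, with field names modeled on the functions above rather than taken verbatim from ChCore:
/* Sketch of a policy "vtable"; each scheduling policy provides one. */
struct sched_ops {
        int (*sched_init)(void);
        int (*sched)(void);
        int (*sched_periodic)(void);
        int (*sched_enqueue)(struct thread *thread);
        int (*sched_dequeue)(struct thread *thread);
};

/* The RR instance, wired to the functions shown earlier. */
struct sched_ops rr = {
        .sched_init     = rr_sched_init,
        .sched          = rr_sched,
        .sched_periodic = rr_sched, /* plain RR: periodic == sched */
        .sched_enqueue  = rr_sched_enqueue,
        .sched_dequeue  = rr_sched_dequeue,
};

/* Dispatch through the currently selected policy. */
extern struct sched_ops *cur_sched_ops;
#define sched()          (cur_sched_ops->sched())
#define sched_periodic() (cur_sched_ops->sched_periodic())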
/* SYSCALL functions */
void sys_yield(void)
{
current_thread->thread_ctx->sc->budget = 0;
sched();
eret_to_thread(switch_context());
}
switch_context itself is actually fairly involved; it has to switch quite a few things:
- vmspace: this structure lives in the thread (remember that its core data structure is an rbtree root? see the memory-management notes), but the page table must be switched as well (the page-table address is also kept in the vmspace).
- tls (thread-local storage; in the typical Arm implementation this is the TPIDR_EL0 register, which holds a thread-specific identifier). A sketch of the save/restore appears after the listings below.
- Anyway, the OS has no idea what is stored behind it; it just treats this register as a thread-specific identifier and maintains it in its own thread implementation. How language-level TLS is implemented on top of this is the business of compiler or library authors, e.g. storing a pointer to some dedicated space (small ones on the stack, large ones on the heap).
- Arm compiler support docs: https://developer.arm.com/documentation/dui0205/g/arm-compiler-reference/compiler-specific-features/thread-local-storage
- Some discussion of TLS: https://forum.osdev.org/viewtopic.php?t=36597
- FPU state.
- Whatever other machinery one wants to add, e.g. saving and clearing some TLB data (history) ...
- Setting the switched-to thread's registers on the CPU.
The concrete code:
/*
* This function is used after current_thread is set (a new thread needs to be
* scheduled).
*
* Switch context between current_thread and current_thread->prev_thread:
* including: vmspace, fpu, tls, ...
*
* Return the context pointer which should be set to stack pointer register.
*/
vaddr_t switch_context(void)
{
struct thread *target_thread;
struct thread_ctx *target_ctx;
struct thread *prev_thread;
target_thread = current_thread;
if (!target_thread || !target_thread->thread_ctx) {
kwarn("%s no thread_ctx", __func__);
return 0;
}
target_ctx = target_thread->thread_ctx;
prev_thread = target_thread->prev_thread;
if (prev_thread == THREAD_ITSELF)
return (vaddr_t)target_ctx;
#if FPU_SAVING_MODE == EAGER_FPU_MODE
save_fpu_state(prev_thread);
restore_fpu_state(target_thread);
#else
/* FPU_SAVING_MODE == LAZY_FPU_MODE */
if (target_thread->thread_ctx->type > TYPE_KERNEL)
disable_fpu_usage();
#endif
/* Switch the TLS information: save and restore */
switch_tls_info(prev_thread, target_thread);
#ifndef CHCORE_KERNEL_TEST
BUG_ON(!target_thread->vmspace);
/*
* Recording the CPU the thread runs on: for TLB maintenance.
* switch_context is always required for running a (new) thread.
* So, we invoke record_running_cpu here.
*/
record_history_cpu(target_thread->vmspace, smp_get_cpu_id());
if ((!prev_thread) || (prev_thread->vmspace != target_thread->vmspace))
switch_thread_vmspace_to(target_thread);
#else /* CHCORE_KERNEL_TEST */
/* TYPE_TESTS threads do not have vmspace. */
if (target_thread->thread_ctx->type != TYPE_TESTS) {
BUG_ON(!target_thread->vmspace);
record_history_cpu(target_thread->vmspace, smp_get_cpu_id());
switch_thread_vmspace_to(target_thread);
}
#endif /* CHCORE_KERNEL_TEST */
arch_switch_context(target_thread);
return (vaddr_t)target_ctx;
}
void arch_switch_context(struct thread *target)
{
struct per_cpu_info *info;
info = get_per_cpu_info();
/* Set the `cur_exec_ctx` in the per_cpu info. */
info->cur_exec_ctx = (u64)target->thread_ctx;
}
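As promised above, a sketch of the TLS save/restore on AArch64. The tls_base_reg field name is an assumption; the point is just that TPIDR_EL0 is spilled into the outgoing thread's context and reloaded from the incoming one:
/* Sketch: save the outgoing thread's TPIDR_EL0 and install the
 * incoming thread's (tls_base_reg is an assumed field name). */
void switch_tls_info(struct thread *from, struct thread *to)
{
        u64 tls;

        if (from && from->thread_ctx) {
                asm volatile("mrs %0, tpidr_el0" : "=r"(tls));
                from->thread_ctx->tls_base_reg = tls;
        }
        if (to && to->thread_ctx) {
                tls = to->thread_ctx->tls_base_reg;
                asm volatile("msr tpidr_el0, %0" ::"r"(tls));
        }
}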
Preemptive Scheduling
The first thing preemption needs is timer interrupts.
Supporting the timer interrupt is really no different from supporting any other peripheral: abstractly, it comes down to reading and writing registers, and configuring when the part actually wired to the CPU's interrupt pin raises its signal.
Quoting the documentation:
Registers used by the Arm Generic Timer
- CNTPCT_EL0: its value is the current system count.
- CNTFRQ_EL0: its value is the frequency of the physical timer, i.e. how much the system count increases per second.
- CNTP_CVAL_EL0: a 64-bit register; the OS can write a value into it, and when the system count reaches or exceeds that value, the physical timer raises an interrupt.
- CNTP_TVAL_EL0: a 32-bit register; when the OS writes TVAL, the processor internally reads the current system count, adds the written value, and fills CVAL with the result.
- CNTP_CTL_EL0: the physical timer's control register. Bit 0, ENABLE, controls whether the timer is on: 1 means enabled, 0 disabled. Bit 1, IMASK, controls whether the timer interrupt is masked: 0 means not masked, 1 masked.
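To make the TVAL arithmetic concrete: with, say, CNTFRQ_EL0 = 62500000 (62.5 MHz, a value QEMU commonly reports) and TICK_MS = 10, plat_timer_init below computes cntp_tval = 62500000 * 10 / 1000 = 625000 and writes it to CNTP_TVAL_EL0, so the hardware fires the next timer interrupt 10 ms of system count later.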
void plat_timer_init(void)
{
u64 count_down = 0;
u64 timer_ctl = 0;
u32 cpuid = smp_get_cpu_id();
/* Since QEMU only emulates the generic timer, we use the generic timer here */
asm volatile ("mrs %0, cntpct_el0":"=r" (cntp_init));
kdebug("timer init cntpct_el0 = %lu\n", cntp_init);
asm volatile ("mrs %0, cntfrq_el0":"=r" (cntp_freq));
kdebug("timer init cntfrq_el0 = %lu\n", cntp_freq);
/* Calculate the tv */
cntp_tval = (cntp_freq * TICK_MS / 1000);
tick_per_us = cntp_freq / 1000 / 1000;
kinfo("CPU freq %lu, set timer %lu\n", cntp_freq, cntp_tval);
/* set the timervalue here */
asm volatile ("msr cntp_tval_el0, %0"::"r" (cntp_tval));
asm volatile ("mrs %0, cntp_tval_el0":"=r" (count_down));
kdebug("timer init cntp_tval_el0 = %lu\n", count_down);
/* Enable CNTPNSIRQ and CNTVIRQ */
put32(core_timer_irqcntl[cpuid], INT_SRC_TIMER1 | INT_SRC_TIMER3);
/* Set the control register */
timer_ctl = 0 << 1 | 1; /* IMASK = 0 ENABLE = 1 */
asm volatile ("msr cntp_ctl_el0, %0"::"r" (timer_ctl));
asm volatile ("mrs %0, cntp_ctl_el0":"=r" (timer_ctl));
kdebug("timer init cntp_ctl_el0 = %lu\n", timer_ctl);
/* enable interrupt controller */
return;
}
The heart of preemption is the following passage:
ChCore records the time slice each thread owns (thread->thread_ctx->sc->budget). To let threads take turns running, we decrement the currently running thread's time slice while handling the timer interrupt, and once its time slice is exhausted we reschedule and pick a new thread to run.
switch (irq) {
case INT_SRC_TIMER1:
/* CNTPNSIRQ (Physical Non-Secure timer IRQ) */
// kinfo("handle_timer_irq\n");
handle_timer_irq();
return;
// ...
tick_delta = get_next_tick_delta();
// The reason this is not a fixed value (even though the tick is fixed) is
// that the timer does more than periodic hard interrupts: other threads'
// waits, sleeps, and so on can move the next timer IRQ earlier.
unlock(local_sleep_list_lock);
time_states[smp_get_cpu_id()].next_expire = current_tick + tick_delta;
plat_handle_timer_irq(tick_delta);
/* Current running thread in current_threads[cpuid] */
if (current_thread) {
BUG_ON(!current_thread->thread_ctx->sc);
BUG_ON(current_thread->thread_ctx->sc->budget == 0);
current_thread->thread_ctx->sc->budget--;
} else {
kdebug("Timer: system not runnig!\n");
}
// kernel/arch/aarch64/irq/irq_entry.c
/* Interrupt handler for interrupts happening when in EL0. */
void handle_irq(void)
{
plat_handle_irq();
sched_periodic(); // under the rr policy this is just sched
// under the pb policy, sched is usually pb fifo while periodic is rr,
// so the two must be distinguished
eret_to_thread(switch_context()); // return to user space
}
With preemption in place, our user-space programs can break out of the initial procmgr process's loop and truly run.