Multicore Scheduling
Basic scheduling APIs
Ready-queue initialization
This is simple enough that I'll just show the source:
int rr_sched_init(void)
{
int i = 0;
/* Initialize global variables */
for (i = 0; i < PLAT_CPU_NUM; i++) {
init_list_head(&(rr_ready_queue_meta[i].queue_head));
lock_init(&(rr_ready_queue_meta[i].queue_lock));
rr_ready_queue_meta[i].queue_len = 0;
}
return 0;
}
After kernel initialization finishes, create_root_thread is called to create the first user-space process and its thread. At the end of create_root_thread, sched_enqueue is called to put the newly created thread onto a ready queue. sched_enqueue eventually calls rr_sched_enqueue, defined in kernel/sched/policy_rr.c. That function first picks an appropriate CPU core's ready queue (taking into account whether the thread is pinned to a core, as well as load balancing across cores), then calls __rr_sched_enqueue to insert the thread into the chosen queue.
Under the RR policy, picking the ready queue is a simple algorithm: if the current core's queue length has not reached the load-balancing threshold, use the current CPU; once it has, pick the CPU with the fewest queued threads, then bump that queue's length. The threshold is tunable, but this is still a very simple policy, and each CPU's queue metadata is guarded by one big lock ("one big lock keeps it safe"), with no further optimization. A sketch of this selection step follows.
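Below is a minimal sketch of the enqueue wrapper and the CPU-selection step just described, written against the names used in the listings on this page. The helper rr_sched_choose_cpu, the affinity field, NO_AFF, and LOADBALANCE_THRESHOLD are assumptions for illustration; the real ChCore code may spell these differently.
/* Hypothetical helper: pick a ready queue for @thread. */
static int rr_sched_choose_cpu(struct thread *thread)
{
        unsigned int cpuid = smp_get_cpu_id();
        unsigned int i, min_cpu;

        /* A thread pinned to a core always goes to that core
         * (the affinity field and NO_AFF are assumed names). */
        if (thread->thread_ctx->affinity != NO_AFF)
                return thread->thread_ctx->affinity;

        /* Below the load-balancing threshold: stay on the local core. */
        if (rr_ready_queue_meta[cpuid].queue_len <= LOADBALANCE_THRESHOLD)
                return cpuid;

        /* Otherwise pick the core with the shortest ready queue. */
        min_cpu = 0;
        for (i = 1; i < PLAT_CPU_NUM; i++) {
                if (rr_ready_queue_meta[i].queue_len
                    < rr_ready_queue_meta[min_cpu].queue_len)
                        min_cpu = i;
        }
        return min_cpu;
}

/* Sketch of the wrapper: select a queue, then insert under its lock
 * (error checks omitted). */
int rr_sched_enqueue(struct thread *thread)
{
        int cpuid, ret;

        cpuid = rr_sched_choose_cpu(thread);
        lock(&(rr_ready_queue_meta[cpuid].queue_lock));
        ret = __rr_sched_enqueue(thread, cpuid);
        unlock(&(rr_ready_queue_meta[cpuid].queue_lock));
        return ret;
}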
And because this is RR, priority plays no role here: after a few simple checks the thread is just appended to the list.
int __rr_sched_enqueue(struct thread *thread, int cpuid)
{
if (thread->thread_ctx->type == TYPE_IDLE) {
return 0;
}
/* Already in the ready queue */
if (thread_is_ts_ready(thread)) {
return -EINVAL;
}
thread->thread_ctx->cpuid = cpuid;
thread_set_ts_ready(thread);
obj_ref(thread);
list_append(&(thread->ready_queue_node),
&(rr_ready_queue_meta[cpuid].queue_head));
rr_ready_queue_meta[cpuid].queue_len++;
return 0;
}
Likewise, dequeueing is a very simple API (the inner __rr_sched_dequeue is sketched after this listing):
/*
* remove @thread from its current residual ready queue
*/
int rr_sched_dequeue(struct thread *thread)
{
BUG_ON(!thread);
BUG_ON(!thread->thread_ctx);
/* IDLE thread will **not** be in any ready queue */
BUG_ON(thread->thread_ctx->type == TYPE_IDLE);
unsigned int cpuid = 0;
int ret = 0;
cpuid = thread->thread_ctx->cpuid;
lock(&(rr_ready_queue_meta[cpuid].queue_lock));
ret = __rr_sched_dequeue(thread);
unlock(&(rr_ready_queue_meta[cpuid].queue_lock));
return ret;
}
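__rr_sched_dequeue itself is not shown above; here is a reconstruction that simply undoes what __rr_sched_enqueue did. The intermediate-state setter thread_set_ts_inter is an assumption, as is the exact error handling:
/* Sketch: remove @thread from its ready queue. Must be called with
 * the queue lock held, mirroring rr_sched_dequeue above. */
int __rr_sched_dequeue(struct thread *thread)
{
        /* Not on any ready queue. */
        if (!thread_is_ts_ready(thread))
                return -EINVAL;

        list_del(&(thread->ready_queue_node));
        rr_ready_queue_meta[thread->thread_ctx->cpuid].queue_len--;
        thread_set_ts_inter(thread); /* assumed intermediate state */
        obj_put(thread);             /* drop the ref taken at enqueue */
        return 0;
}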
Choosing a runnable thread is a bit more involved, because it includes a fast-path optimization to keep lock overhead down.
Concretely, before picking a thread we first check for emptiness without holding the lock: if the ready queue is empty, the idle thread is returned right away.
Only if it is non-empty do we take the lock, which reduces lock contention under light load.
With the lock held, we walk this core's ready queue looking for a thread whose kernel stack is free, or that is the current thread; if none is found, we return idle.
(Per the comment in the code, this relates to scheduling threads across cores: load balancing may hand a thread to another CPU while its kernel context, such as its kernel stack, still needs time to be migrated or released, e.g. it is mid-syscall or mid-interrupt on the original core, so the ready queue may momentarily contain no thread whose stack is free. The emptiness check is also repeated under the lock because the queue can drain between the unlocked fast-path check and lock acquisition.)
/*
* Choose an appropriate thread and dequeue from ready queue
*/
struct thread *rr_sched_choose_thread(void)
{
unsigned int cpuid = smp_get_cpu_id();
struct thread *thread = NULL;
if (!list_empty(&(rr_ready_queue_meta[cpuid].queue_head))) {
lock(&(rr_ready_queue_meta[cpuid].queue_lock));
again:
if (list_empty(&(rr_ready_queue_meta[cpuid].queue_head))) {
unlock(&(rr_ready_queue_meta[cpuid].queue_lock));
goto out;
}
/*
* When the thread is just moved from another cpu and
* the kernel stack is used by the original core, try
* to find another thread.
*/
if (!(thread = find_runnable_thread(
&(rr_ready_queue_meta[cpuid].queue_head)))) {
unlock(&(rr_ready_queue_meta[cpuid].queue_lock));
goto out;
}
BUG_ON(__rr_sched_dequeue(thread));
if (thread_is_exiting(thread) || thread_is_exited(thread)) {
/* Thread need to exit. Set the state to TE_EXITED */
thread_set_exited(thread);
goto again;
}
unlock(&(rr_ready_queue_meta[cpuid].queue_lock));
return thread;
}
out:
return &idle_threads[cpuid];
}
/*
 * find_runnable_thread
 * ** Should hold a dedicated lock for the thread_list, and this function can
 * only be called in the critical section!! ** Only a thread whose kernel
 * stack state is free can be chosen
 */
struct thread *find_runnable_thread(struct list_head *thread_list)
{
struct thread *thread;
for_each_in_list (
thread, struct thread, ready_queue_node, thread_list) {
if (!thread_is_suspend(thread)
&& (thread->thread_ctx->kernel_stack_state == KS_FREE
|| thread == current_thread)) {
return thread;
}
}
return NULL;
}
As for rr_sched itself, the logic is as follows:
- If there is no old thread, just choose one.
- If there is an old thread, look at its type: some IPC-related thread types need to pass their scheduling context on to the newly scheduled thread.
- Check whether the thread's own time slice (sc->budget, where sc is the scheduler context) is exhausted; if not, keep running it.
- If it is exhausted, switch to a new thread.
The switch itself is simple, since at this point we have not yet returned to user space: it just updates some metadata around current_thread and sets prev_thread on the new thread; a sketch follows.
(An example that does return to user space is sched_to_thread, which is more involved internally: it may schedule across cores, and then has to use an IPI, an architecture-specific inter-processor interrupt, to wait for the target CPU core to finish handling its pending interrupts and release the kernel stack, before the architecture-specific context switch, register save/load, and the return to user space.)
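A minimal sketch of what switch_to_thread amounts to, based only on the description above (the state setter thread_set_ts_running is an assumed name):
/* Sketch: bookkeeping-only switch; no user-space return happens here. */
static void switch_to_thread(struct thread *target)
{
        BUG_ON(!target);
        BUG_ON(!target->thread_ctx);

        /* Record which core the thread runs on and mark it running. */
        target->thread_ctx->cpuid = smp_get_cpu_id();
        thread_set_ts_running(target); /* assumed state setter */

        /* Remember who ran before; switch_context (shown later) skips
         * the save/restore when prev_thread is THREAD_ITSELF. */
        target->prev_thread =
                (target == current_thread) ? THREAD_ITSELF : current_thread;
        current_thread = target;
}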
/*
* Schedule a thread to execute.
* current_thread can be NULL, or the state is TS_RUNNING or
* TS_WAITING/TS_BLOCKING. This function will suspend current running thread, if
* any, and schedule another thread from
* `(rr_ready_queue_meta[cpuid].queue_head)`.
* ***the following text might be outdated***
* 1. Choose an appropriate thread through calling *chooseThread* (Simple
* Priority-Based Policy)
* 2. Update the current running thread and leave the caller to restore the executing
* context
*/
int rr_sched(void)
{
/* WITH IRQ Disabled */
struct thread *old = current_thread;
struct thread *new = 0;
if (old) {
BUG_ON(!old->thread_ctx);
/* old thread may pass its scheduling context to others. */
if (old->thread_ctx->type != TYPE_SHADOW
&& old->thread_ctx->type != TYPE_REGISTER) {
BUG_ON(!old->thread_ctx->sc);
}
/* Setting TE_EXITING after the check won't cause any trouble; the
 * thread will be recycled afterwards. Just a fast path. */
/* Check whether the thread is going to exit */
if (thread_is_exiting(old)) {
/* Set the state to TE_EXITED */
thread_set_exited(old);
}
/* check old state */
if (!thread_is_exited(old)) {
if (thread_is_ts_running(old)) {
/* A thread without SC should not be TS_RUNNING.
*/
BUG_ON(!old->thread_ctx->sc);
if (old->thread_ctx->sc->budget != 0
&& !thread_is_suspend(old)) {
switch_to_thread(old);
return 0; /* no schedule needed */
}
rr_sched_refill_budget(old, DEFAULT_BUDGET);
BUG_ON(rr_sched_enqueue(old) != 0);
} else if (!thread_is_ts_blocking(old)
&& !thread_is_ts_waiting(old)) {
kinfo("thread state: %d\n",
old->thread_ctx->state);
BUG_ON(1);
}
}
}
BUG_ON(!(new = rr_sched_choose_thread()));
switch_to_thread(new);
return 0;
}
Cooperative Scheduling
With the APIs above, this is very easy!
Implementing cooperative scheduling only requires letting user space voluntarily give up the CPU, i.e. a yield syscall.
In the kernel we simply clear the thread's time slice, switch threads with sched (a "virtual function" of the policy "base class", "overridden" here to rr_sched), and eret back to user space.
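The "virtual function" remark refers to a table of function pointers that each policy fills in. A sketch of what that might look like, with field names modeled on the functions above rather than taken verbatim from ChCore:
/* Sketch of a policy "vtable"; each scheduling policy provides one. */
struct sched_ops {
        int (*sched_init)(void);
        int (*sched)(void);
        int (*sched_periodic)(void);
        int (*sched_enqueue)(struct thread *thread);
        int (*sched_dequeue)(struct thread *thread);
};

/* The RR instance, wired to the functions shown earlier. */
struct sched_ops rr = {
        .sched_init     = rr_sched_init,
        .sched          = rr_sched,
        .sched_periodic = rr_sched, /* plain RR: periodic == sched */
        .sched_enqueue  = rr_sched_enqueue,
        .sched_dequeue  = rr_sched_dequeue,
};

/* Dispatch through the currently selected policy. */
extern struct sched_ops *cur_sched_ops;
#define sched()          (cur_sched_ops->sched())
#define sched_periodic() (cur_sched_ops->sched_periodic())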
/* SYSCALL functions */
void sys_yield(void)
{
current_thread->thread_ctx->sc->budget = 0;
sched();
eret_to_thread(switch_context());
}
switch_context itself is actually fairly involved; it has to switch quite a few things:
- vmspace: this structure lives in the thread (remember that its core data structure is an rbtree root? see the memory-management notes), but the page table must be switched as well (the page-table address is also kept in the vmspace).
- tls (thread-local storage; in the typical Arm implementation this is the TPIDR_EL0 register, which holds a thread-specific identifier). A sketch of the save/restore appears after the listings below.
- Anyway, the OS has no idea what is stored behind it; it just treats this register as a thread-specific identifier and maintains it in its own thread implementation. How language-level TLS is implemented on top of this is the business of compiler or library authors, e.g. storing a pointer to some dedicated space (small ones on the stack, large ones on the heap).
- Arm compiler support docs: https://developer.arm.com/documentation/dui0205/g/arm-compiler-reference/compiler-specific-features/thread-local-storage
- Some discussion of TLS: https://forum.osdev.org/viewtopic.php?t=36597
- FPU state.
- Whatever other machinery one wants to add, e.g. saving and clearing some TLB data (history) ...
- Setting the switched-to thread's registers on the CPU.
The concrete code:
/*
* This function is used after current_thread is set (a new thread needs to be
* scheduled).
*
* Switch context between current_thread and current_thread->prev_thread:
* including: vmspace, fpu, tls, ...
*
* Return the context pointer which should be set to stack pointer register.
*/
vaddr_t switch_context(void)
{
struct thread *target_thread;
struct thread_ctx *target_ctx;
struct thread *prev_thread;
target_thread = current_thread;
if (!target_thread || !target_thread->thread_ctx) {
kwarn("%s no thread_ctx", __func__);
return 0;
}
target_ctx = target_thread->thread_ctx;
prev_thread = target_thread->prev_thread;
if (prev_thread == THREAD_ITSELF)
return (vaddr_t)target_ctx;
#if FPU_SAVING_MODE == EAGER_FPU_MODE
save_fpu_state(prev_thread);
restore_fpu_state(target_thread);
#else
/* FPU_SAVING_MODE == LAZY_FPU_MODE */
if (target_thread->thread_ctx->type > TYPE_KERNEL)
disable_fpu_usage();
#endif
/* Switch the TLS information: save and restore */
switch_tls_info(prev_thread, target_thread);
#ifndef CHCORE_KERNEL_TEST
BUG_ON(!target_thread->vmspace);
/*
* Recording the CPU the thread runs on: for TLB maintenance.
* switch_context is always required for running a (new) thread.
* So, we invoke record_running_cpu here.
*/
record_history_cpu(target_thread->vmspace, smp_get_cpu_id());
if ((!prev_thread) || (prev_thread->vmspace != target_thread->vmspace))
switch_thread_vmspace_to(target_thread);
#else /* CHCORE_KERNEL_TEST */
/* TYPE_TESTS threads do not have vmspace. */
if (target_thread->thread_ctx->type != TYPE_TESTS) {
BUG_ON(!target_thread->vmspace);
record_history_cpu(target_thread->vmspace, smp_get_cpu_id());
switch_thread_vmspace_to(target_thread);
}
#endif /* CHCORE_KERNEL_TEST */
arch_switch_context(target_thread);
return (vaddr_t)target_ctx;
}
void arch_switch_context(struct thread *target)
{
struct per_cpu_info *info;
info = get_per_cpu_info();
/* Set the `cur_exec_ctx` in the per_cpu info. */
info->cur_exec_ctx = (u64)target->thread_ctx;
}
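As promised above, a sketch of the TLS save/restore on AArch64. The tls_base_reg field name is an assumption; the point is just that TPIDR_EL0 is spilled into the outgoing thread's context and reloaded from the incoming one:
/* Sketch: save the outgoing thread's TPIDR_EL0 and install the
 * incoming thread's (tls_base_reg is an assumed field name). */
void switch_tls_info(struct thread *from, struct thread *to)
{
        u64 tls;

        if (from && from->thread_ctx) {
                asm volatile("mrs %0, tpidr_el0" : "=r"(tls));
                from->thread_ctx->tls_base_reg = tls;
        }
        if (to && to->thread_ctx) {
                tls = to->thread_ctx->tls_base_reg;
                asm volatile("msr tpidr_el0, %0" ::"r"(tls));
        }
}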
Preemptive Scheduling
The first thing preemption needs is timer interrupts.
Supporting the timer interrupt is really no different from supporting any other peripheral: abstractly, it comes down to reading and writing registers, and configuring when the part actually wired to the CPU's interrupt pin raises its signal.
Quoting the documentation:
Registers used by the Arm Generic Timer
- CNTPCT_EL0: its value is the current system count.
- CNTFRQ_EL0: its value is the frequency of the physical timer, i.e. how much the system count increases per second.
- CNTP_CVAL_EL0: a 64-bit register; the OS can write a value into it, and when the system count reaches or exceeds that value, the physical timer raises an interrupt.
- CNTP_TVAL_EL0: a 32-bit register; when the OS writes TVAL, the processor internally reads the current system count, adds the written value, and fills CVAL with the result.
- CNTP_CTL_EL0: the physical timer's control register. Bit 0, ENABLE, controls whether the timer is on: 1 means enabled, 0 disabled. Bit 1, IMASK, controls whether the timer interrupt is masked: 0 means not masked, 1 masked.
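To make the TVAL arithmetic concrete: with, say, CNTFRQ_EL0 = 62500000 (62.5 MHz, a value QEMU commonly reports) and TICK_MS = 10, plat_timer_init below computes cntp_tval = 62500000 * 10 / 1000 = 625000 and writes it to CNTP_TVAL_EL0, so the hardware fires the next timer interrupt 10 ms of system count later.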
void plat_timer_init(void)
{
u64 count_down = 0;
u64 timer_ctl = 0;
u32 cpuid = smp_get_cpu_id();
/* Since QEMU only emulates the generic timer, we use the generic timer here */
asm volatile ("mrs %0, cntpct_el0":"=r" (cntp_init));
kdebug("timer init cntpct_el0 = %lu\n", cntp_init);
asm volatile ("mrs %0, cntfrq_el0":"=r" (cntp_freq));
kdebug("timer init cntfrq_el0 = %lu\n", cntp_freq);
/* Calculate the tv */
cntp_tval = (cntp_freq * TICK_MS / 1000);
tick_per_us = cntp_freq / 1000 / 1000;
kinfo("CPU freq %lu, set timer %lu\n", cntp_freq, cntp_tval);
/* set the timervalue here */
asm volatile ("msr cntp_tval_el0, %0"::"r" (cntp_tval));
asm volatile ("mrs %0, cntp_tval_el0":"=r" (count_down));
kdebug("timer init cntp_tval_el0 = %lu\n", count_down);
/* Enable CNTPNSIRQ and CNTVIRQ */
put32(core_timer_irqcntl[cpuid], INT_SRC_TIMER1 | INT_SRC_TIMER3);
/* Set the control register */
timer_ctl = 0 << 1 | 1; /* IMASK = 0 ENABLE = 1 */
asm volatile ("msr cntp_ctl_el0, %0"::"r" (timer_ctl));
asm volatile ("mrs %0, cntp_ctl_el0":"=r" (timer_ctl));
kdebug("timer init cntp_ctl_el0 = %lu\n", timer_ctl);
/* enable interrupt controller */
return;
}
The heart of preemption is the following passage:
ChCore records the time slice each thread owns (thread->thread_ctx->sc->budget). To let threads take turns running, we decrement the currently running thread's time slice while handling the timer interrupt, and once its time slice is exhausted we reschedule and pick a new thread to run.
switch (irq) {
case INT_SRC_TIMER1:
/* CNTPNSIRQ (Physical Non-Secure timer IRQ) */
// kinfo("handle_timer_irq\n");
handle_timer_irq();
return;
// ...
tick_delta = get_next_tick_delta();
// The reason this is not a fixed value (even though the tick is fixed) is
// that the timer does more than periodic hard interrupts: other threads'
// waits, sleeps, and so on can move the next timer IRQ earlier.
unlock(local_sleep_list_lock);
time_states[smp_get_cpu_id()].next_expire = current_tick + tick_delta;
plat_handle_timer_irq(tick_delta);
/* Current running thread in current_threads[cpuid] */
if (current_thread) {
BUG_ON(!current_thread->thread_ctx->sc);
BUG_ON(current_thread->thread_ctx->sc->budget == 0);
current_thread->thread_ctx->sc->budget--;
} else {
kdebug("Timer: system not runnig!\n");
}
// kernel/arch/aarch64/irq/irq_entry.c
/* Interrupt handler for interrupts happening when in EL0. */
void handle_irq(void)
{
plat_handle_irq();
sched_periodic(); // under the rr policy this is just sched
// under the pb policy, sched is usually pb fifo while periodic is rr,
// so the two must be distinguished
eret_to_thread(switch_context()); // return to user space
}
With preemption in place, our user-space programs can break out of the initial procmgr process's loop and truly run.