Capability Group Management
The capability mechanism
The ChCore documentation only gives a brief description of the capability-group mechanism, but the same idea is also well documented in the Linux world...
ref: https://www.cnblogs.com/sparkdev/p/11417781.html
A short introduction of the term on Wikipedia: https://en.wikipedia.org/wiki/Capability-based_operating_system
tl;dr: capabilities are a security-motivated mechanism that gives a process "partial root"; they are, for example, what lets an unprivileged user run /bin/ping.
In ChCore every system resource is called an object. In object-oriented terms, object is the parent class of the various kernel objects such as vmspace, pmo, thread and so on. ChCore manages all system resources through the capability-group mechanism, and a capability group itself is essentially just an array of pointers to objects.
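This section never shows struct object itself, but from how it is used below (obj_alloc takes a type and a size, and the deinit table later dispatches on a type) one can guess a shape roughly like the following. This is only a guessed layout for orientation; the field names are assumptions, not the real ChCore definition.

/* Guessed layout, for orientation only -- NOT the actual ChCore definition.
 * The idea: a small common header, followed by the concrete kernel object
 * (a vmspace, a pmo, a thread, ...) stored in the flexible array member. */
struct object {
        u64 type;                     /* TYPE_CAP_GROUP, TYPE_VMSPACE, ... */
        u64 size;                     /* size of the payload below */
        struct list_head copies_head; /* all object_slots pointing at this object */
        u64 refcount;                 /* released when this drops to zero */
        u64 opaque[];                 /* the concrete object lives here */
};

The slot, table, and group structures that actually appear in the code are: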
struct object_slot {
        int slot_id;
        struct cap_group *cap_group;
        struct object *object;
        /* link copied slots pointing to the same object */
        struct list_head copies;
        /* rights for object */
        cap_right_t rights;
};
#define BASE_OBJECT_NUM BITS_PER_LONG
/* 1st cap is cap_group. 2nd cap is vmspace */
#define CAP_GROUP_OBJ_ID 0
#define VMSPACE_OBJ_ID 1
struct slot_table {
        unsigned int slots_size;
        struct object_slot **slots;
        /*
         * if a bit in full_slots_bmp is 1, corresponding
         * sizeof(unsigned long) bits in slots_bmp are all set
         */
        unsigned long *full_slots_bmp;
        unsigned long *slots_bmp;
        struct rwlock table_guard;
};
#define MAX_GROUP_NAME_LEN 63
struct cap_group {
        struct slot_table slot_table;

        /* Proctect thread_list and thread_cnt */
        struct lock threads_lock;
        struct list_head thread_list;
        /* The number of threads */
        int thread_cnt;

        /*
         * Each process has a unique badge as a global identifier which
         * is set by the system server, procmgr.
         * Currently, badge is used as a client ID during IPC.
         */
        badge_t badge;
        int pid;

        /* Ensures the cap_group_exit function only be executed once */
        int notify_recycler;

        /* A ptrace object that the process attached to */
        void *attached_ptrace;

        /* Now is used for debugging */
        char cap_group_name[MAX_GROUP_NAME_LEN + 1];

        /* Each Process has its own futex status */
        struct lock futex_lock;
        struct htable futex_entries;

#ifdef CHCORE_OPENTRUSTEE
        TEE_UUID uuid;
        size_t heap_size_limit;
#endif /* CHCORE_OPENTRUSTEE */
};
#define current_cap_group (current_thread->cap_group)
Pointers seem to fly all over the place; what a mess! Let's untangle it piece by piece.
The documentation is a good place to start.
First, a capability group itself is just an array of pointers to objects,
which shows up as:
struct cap_group {
        struct slot_table slot_table;
        // ...
};

struct slot_table {
        unsigned int slots_size;
        struct object_slot **slots;
        // bitmaps to speed up slot allocation, and ...
};

struct object_slot {
        int slot_id;
        struct object *object;
        cap_right_t rights;     /* rights for object */
        // below: back references
        struct cap_group *cap_group;
        /* link copied slots pointing to the same object */
        struct list_head copies;
};
Laid out like this, it becomes clear:
A "capability" is some kind of kernel object: a file descriptor, a network connection, an IPC channel, ...
Each process holds a set of pointers to kernel objects, and those pointers need reference counting to make sure the resources eventually get released.
For simplicity, assume that ownership of every resource is shared among the process's threads and its child processes.
Suppose a parent process p holds a set of kernel-object resources, say file descriptors.
Then shouldn't p's threads also see those file descriptors, and shouldn't p's children as well? With that analogy in mind, cap_group's thread_list makes sense (another back reference! the thread structure is the side that logically "owns the resources").
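To make the indirection concrete, here is a minimal sketch of how a capability, i.e. a slot index, gets resolved to an object pointer through the structures above. This is illustrative only and not the actual ChCore helper; the real lookup also takes the table_guard rwlock, checks the expected object type, and verifies the slot's rights.

/* Illustrative only: resolve a capability (slot index) to the object it
 * names, using the struct layout shown above. */
static struct object *lookup_object(struct cap_group *cap_group, cap_t cap)
{
        struct slot_table *table = &cap_group->slot_table;
        struct object_slot *slot;

        /* a cap is nothing more than an index into the slot array */
        if (cap < 0 || (unsigned int)cap >= table->slots_size)
                return NULL;            /* out of range */

        slot = table->slots[cap];
        if (!slot)
                return NULL;            /* empty slot */

        return slot->object;            /* the kernel object this cap names */
}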
PS: in fact I have always felt that one big reason kernel C code is hard to read is that it is so hard to tell who owns which pointer.
As for the details, some kernel objects need special "destructors", so you can also find a "virtual function table" implemented in plain C:
const obj_deinit_func obj_deinit_tbl[TYPE_NR] = {
        [0 ... TYPE_NR - 1] = NULL,
        [TYPE_CAP_GROUP] = cap_group_deinit,
        [TYPE_THREAD] = thread_deinit,
        [TYPE_CONNECTION] = connection_deinit,
        [TYPE_NOTIFICATION] = notification_deinit,
        [TYPE_IRQ] = irq_deinit,
        [TYPE_PMO] = pmo_deinit,
        [TYPE_VMSPACE] = vmspace_deinit,
#ifdef CHCORE_OPENTRUSTEE
        [TYPE_CHANNEL] = channel_deinit,
        [TYPE_MSG_HDL] = msg_hdl_deinit,
#endif /* CHCORE_OPENTRUSTEE */
        [TYPE_PTRACE] = ptrace_deinit
};
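Correspondingly, when the last reference to an object goes away, the release path can dispatch through this table before freeing the memory. The following is only a sketch of that idea, not the exact ChCore code: it assumes struct object records its type and carries the concrete object as an opaque payload (both field names and the object_release function are assumptions), and it ignores locking and the copies list.

/* Sketch: dispatch to the per-type "destructor" before freeing an object. */
static void object_release(struct object *object)
{
        /* look up the per-type cleanup hook; NULL means nothing special to do */
        obj_deinit_func deinit = obj_deinit_tbl[object->type];

        if (deinit)
                deinit(object->opaque); /* type-specific cleanup, e.g. vmspace_deinit */

        kfree(object);                  /* finally return the memory itself */
}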
Around these data structures, capability.c carefully implements copying, releasing, and transferring capabilities. Because of the complexity of reference-counted ownership, plus concurrency, plus the finer-grained locking used for performance, the code is not particularly readable; interested readers are welcome to dig into it on their own.
Back to the documentation: the next step is to look at how a cap_group gets created.
The create_root_cap_group function
It creates the very first cap_group, the one that belongs to the first user-space process, init.
Since a cap_group is itself a kernel object and therefore also has to live inside a cap_group, the helper functions already provided are enough to write:
cap_group = obj_alloc(TYPE_CAP_GROUP, sizeof(*cap_group));
if (!cap_group) {
        kwarn("failed alloc cap_group in %s\n", __func__);
        return NULL;
}
cap_group_init(cap_group,
               BASE_OBJECT_NUM,
               /* Fixed badge */ ROOT_CAP_GROUP_BADGE);
slot_id = cap_alloc(cap_group, cap_group); // arg: *cap, *obj
We allocate the cap_group object and then put a pointer to itself into one of its own slots.
One might ask: isn't that a circular reference?
A circular reference is exactly what we want here: the init process is never supposed to exit, and its cap_group should never be freed.
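The reference-counting consequence can be shown with a tiny toy model (this is not ChCore code, just an illustration of the argument): as long as the slot inside the group's own slot_table holds a reference, ordinary put operations can never drive the count to zero, so the object only dies through an explicit teardown such as cap_group_exit.

#include <stdio.h>

struct obj { int refcnt; };

static void put(struct obj *o)
{
        if (--o->refcnt == 0)
                printf("freed\n");
}

int main(void)
{
        struct obj root = { .refcnt = 0 };

        root.refcnt++;  /* reference held by the slot in its own slot_table */
        root.refcnt++;  /* some other reference, e.g. from the parent       */

        put(&root);     /* the other reference goes away ...                */

        /* ... but the self-reference pins refcnt at 1, so "freed" is never
         * printed unless that slot itself is explicitly torn down.         */
        printf("refcnt = %d\n", root.refcnt);
        return 0;
}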
Then, as the first user-space process, it also needs a vmspace kernel resource; the logic is similar:
vmspace = obj_alloc(TYPE_VMSPACE, sizeof(*vmspace));
if (!vmspace) {
        kwarn("failed alloc vmspace in %s\n", __func__);
        return NULL;
}
/* fixed PCID 1 for root process, PCID 0 is not used. */
vmspace_init(vmspace, ROOT_PROCESS_PCID);
slot_id = cap_alloc(cap_group, vmspace);
After that, all that is left is to give the cap_group a simple name.
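Putting the excerpts together, the overall shape of create_root_cap_group is roughly the following. This is only a simplified paraphrase of the snippets above; error handling and the exact naming code are omitted.

struct cap_group *create_root_cap_group(void)  /* simplified sketch */
{
        struct cap_group *cap_group;
        struct vmspace *vmspace;

        /* 1. the cap_group object itself goes into its own slot 0 */
        cap_group = obj_alloc(TYPE_CAP_GROUP, sizeof(*cap_group));
        cap_group_init(cap_group, BASE_OBJECT_NUM, ROOT_CAP_GROUP_BADGE);
        cap_alloc(cap_group, cap_group);        /* slot 0: CAP_GROUP_OBJ_ID */

        /* 2. the root vmspace goes into slot 1 */
        vmspace = obj_alloc(TYPE_VMSPACE, sizeof(*vmspace));
        vmspace_init(vmspace, ROOT_PROCESS_PCID);
        cap_alloc(cap_group, vmspace);          /* slot 1: VMSPACE_OBJ_ID */

        /* 3. name the group for debugging and hand it back */
        return cap_group;
}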
The other creation path, sys_create_cap_group, shows up as a syscall handler:
const void *syscall_table[NR_SYSCALL] = {
        // ...
        [CHCORE_SYS_create_cap_group] = sys_create_cap_group,
        // ...
};
The incoming unsigned long cap_group_args_p is the pointer, valid in the calling process's own address space, to the arguments of the create_cap_group syscall that the user process passed in (what a mouthful of a sentence).
Once that is clear, the logic follows naturally:
- A user-space pointer cannot be trusted: check that the range does not reach into kernel space and that the caller is allowed to create.
- Copy the user-space data into the kernel. (If you have done the xv6 labs you may notice that no user page table is passed along for the copy; in a modern OS the kernel page table also contains the user-space mappings, so copy_from_user is just a plain memcpy.)
- Following the same logic as before, alloc a new cap_group object, initialize it with cap_group_init, and fill in a few fields from the arguments passed in.
- The new cap_group should come with the default caps, namely its own cap_group and a vmspace (if that sounds circular, remember: a cap is just a pointer!).
The concrete implementation is as follows; it is not hard to follow.
cap_t sys_create_cap_group(unsigned long cap_group_args_p)
{
        struct cap_group *new_cap_group;
        struct vmspace *vmspace;
        cap_t cap;
        int r;
        struct cap_group_args args = {0};

        r = hook_sys_create_cap_group(cap_group_args_p);
        if (r != 0)
                return r;

        if (check_user_addr_range((vaddr_t)cap_group_args_p,
                                  sizeof(struct cap_group_args)) != 0)
                return -EINVAL;

        r = copy_from_user(
                &args, (void *)cap_group_args_p, sizeof(struct cap_group_args));
        if (r) {
                return -EINVAL;
        }

#ifdef CHCORE_OPENTRUSTEE
        if (check_user_addr_range((vaddr_t)args.puuid, sizeof(TEE_UUID)) != 0)
                return -EINVAL;
#endif /* CHCORE_OPENTRUSTEE */
        if (check_user_addr_range((vaddr_t)args.name, (size_t)args.name_len) != 0)
                return -EINVAL;

        /* cap current cap_group */
        new_cap_group = obj_alloc(TYPE_CAP_GROUP, sizeof(*new_cap_group));
        if (!new_cap_group) {
                r = -ENOMEM;
                goto out_fail;
        }
        cap_group_init(new_cap_group, BASE_OBJECT_NUM, args.badge);
        new_cap_group->pid = args.pid;
#ifdef CHCORE_OPENTRUSTEE
        new_cap_group->heap_size_limit = args.heap_size;
        /* pid used in OH-TEE */
        if (args.puuid) {
                copy_from_user(&new_cap_group->uuid,
                               (void *)args.puuid,
                               sizeof(TEE_UUID));
        } else {
                memset(&new_cap_group->uuid, 0, sizeof(TEE_UUID));
        }
#endif /* CHCORE_OPENTRUSTEE */

        cap = cap_alloc(current_cap_group, new_cap_group);
        if (cap < 0) {
                r = cap;
                goto out_free_obj_new_grp;
        }

        /* 1st cap is cap_group */
        if (cap_copy(current_thread->cap_group,
                     new_cap_group,
                     cap,
                     CAP_RIGHT_NO_RIGHTS,
                     CAP_RIGHT_NO_RIGHTS)
            != CAP_GROUP_OBJ_ID) {
                kwarn("%s: cap_copy fails or cap[0] is not cap_group\n",
                      __func__);
                r = -ECAPBILITY;
                goto out_free_cap_grp_current;
        }

        /* 2st cap is vmspace */
        vmspace = obj_alloc(TYPE_VMSPACE, sizeof(*vmspace));
        if (!vmspace) {
                r = -ENOMEM;
                goto out_free_obj_vmspace;
        }
        vmspace_init(vmspace, args.pcid);
        r = cap_alloc(new_cap_group, vmspace);
        if (r != VMSPACE_OBJ_ID) {
                kwarn("%s: cap_copy fails or cap[1] is not vmspace\n",
                      __func__);
                r = -ECAPBILITY;
                goto out_free_obj_vmspace;
        }

        new_cap_group->notify_recycler = 0;

        /* Set the cap_group_name (process_name) for easing debugging */
        memset(new_cap_group->cap_group_name, 0, MAX_GROUP_NAME_LEN + 1);
        if (args.name_len > MAX_GROUP_NAME_LEN)
                args.name_len = MAX_GROUP_NAME_LEN;

        r = copy_from_user(new_cap_group->cap_group_name,
                           (void *)args.name,
                           args.name_len);
        if (r) {
                r = -EINVAL;
                goto out_free_obj_vmspace;
        }

        return cap;

out_free_obj_vmspace:
        obj_free(vmspace);
out_free_cap_grp_current:
        cap_free(current_cap_group, cap);
        new_cap_group = NULL;
out_free_obj_new_grp:
        obj_free(new_cap_group);
out_fail:
        return r;
}
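For completeness, the caller's side could look roughly like the sketch below. The wrapper name usys_create_cap_group is an assumption (it is not shown in this section), and only the fields that the kernel code above actually reads from struct cap_group_args are filled in; their exact types are likewise assumed.

/* Hypothetical user-space sketch of invoking the syscall above. */
struct cap_group_args args = {
        .badge    = 0x1234,                       /* global ID, normally chosen by procmgr   */
        .name     = (unsigned long)"hello_proc",  /* process name, kept only for debugging   */
        .name_len = sizeof("hello_proc") - 1,
        .pcid     = 2,                            /* PCID 0 is unused, 1 is the root process */
        .pid      = 42,
};

cap_t new_group_cap = usys_create_cap_group((unsigned long)&args);
if (new_group_cap < 0) {
        /* creation failed, e.g. -EINVAL or -ENOMEM */
}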