Skip to main content

能力组管理

capability 机制

文档上只对能力组的机制做了简要的描述,但此事在Linux之中亦有记载……

ref: https://www.cnblogs.com/sparkdev/p/11417781.html

wiki上的简单名词介绍 https://en.wikipedia.org/wiki/Capability-based_operating_system

tl;dr 能力组是为安全而生的进程“部分root”的机制,例如你可以在用户态运行/bin/ping

在Chcore中所有的系统资源都叫做object(对象),用面向对象的方法进行理解的话,object即为不同内核对象例如vmspace, pmo, thread(等等)的父类, Chcore通过能力组机制管理所有的系统资源,能力组本身只是一个包含指向object的指针的数组

/* One capability: a slot that binds an owning cap_group to a kernel object. */
struct object_slot {
/* index of this slot inside its cap_group's slot_table */
int slot_id;
/* back-reference to the cap_group this slot belongs to */
struct cap_group *cap_group;
/* the kernel object (vmspace, pmo, thread, ...) this capability refers to */
struct object *object;
/* link copied slots pointing to the same object */
struct list_head copies;
/* rights for object */
cap_right_t rights;
};

/* Initial slot count for a newly created cap_group (see cap_group_init). */
#define BASE_OBJECT_NUM BITS_PER_LONG
/* 1st cap is cap_group. 2nd cap is vmspace */
#define CAP_GROUP_OBJ_ID 0
#define VMSPACE_OBJ_ID 1

/* Array of object_slot pointers plus occupancy bitmaps for fast allocation. */
struct slot_table {
/* current capacity: number of entries in slots */
unsigned int slots_size;
struct object_slot **slots;
/*
 * if a bit in full_slots_bmp is 1, the corresponding
 * BITS_PER_LONG bits in slots_bmp are all set
 * (i.e. that whole word of slots is occupied)
 */
unsigned long *full_slots_bmp;
/* one bit per slot; used together with full_slots_bmp above */
unsigned long *slots_bmp;
/* reader/writer lock guarding the whole table */
struct rwlock table_guard;
};

/* Maximum length of cap_group_name, excluding the trailing NUL. */
#define MAX_GROUP_NAME_LEN 63

/*
 * A cap_group represents a process: the set of capabilities (slot_table)
 * it holds, plus its threads and per-process state.
 */
struct cap_group {
struct slot_table slot_table;

/* Protect thread_list and thread_cnt */
struct lock threads_lock;
struct list_head thread_list;
/* The number of threads */
int thread_cnt;

/*
 * Each process has a unique badge as a global identifier which
 * is set by the system server, procmgr.
 * Currently, badge is used as a client ID during IPC.
 */
badge_t badge;
int pid;

/* Ensures the cap_group_exit function only be executed once */
int notify_recycler;

/* A ptrace object that the process attached to */
void *attached_ptrace;

/* Now is used for debugging */
char cap_group_name[MAX_GROUP_NAME_LEN + 1];

/* Each Process has its own futex status */
struct lock futex_lock;
struct htable futex_entries;

#ifdef CHCORE_OPENTRUSTEE
TEE_UUID uuid;
size_t heap_size_limit;
#endif /* CHCORE_OPENTRUSTEE */
};

#define current_cap_group (current_thread->cap_group)

看起来指针乱飞的真是乱!让我们一一拆解

文档是一个很好的入手点

capability

首先,能力组本身只是一个包含指向object的指针的数组

体现在

struct cap_group {
struct slot_table slot_table;
// ...
}

struct slot_table {
unsigned int slots_size;
struct object_slot **slots;
// bitmap to speed up and ...
};
struct object_slot {
int slot_id;
struct object *object;
/* rights for object */
cap_right_t rights;
// below: back-references to the owning cap_group
struct cap_group *cap_group;
/* link copied slots pointing to the same object */
struct list_head copies;
};

这样梳理之后就清楚了

“能力”是某种形式的内核对象,例如文件描述符,网络连接,进程通信channel, …

每个进程具有一系列的内核对象的指针,并且这些指针需要一个引用计数来保证资源的释放

为了简化起见,假设所有的资源所有权都是在线程和子进程间共享的

假设父进程p持有一系列内核对象的资源,例如文件描述符

那p的线程是不是也应该有文件描述符,p的子进程是不是也应该有文件描述符?这样类比之后,就可以理解cap_group的thread_list(又是一个反向引用!thread结构体才是逻辑上“拥有资源”的一方)

ps:事实上笔者一直认为,内核c代码难读的很大一个原因在于搞不清楚指针的所有权(

细节上而言,有些内核对象需要特殊的“析构函数”,因而还能看到用传统c实现的“虚函数表”

/*
 * Per-type destructor table — a "vtable" in plain C, indexed by object type.
 * The range designator (a GCC extension) first fills every entry with NULL;
 * the later designated entries then override the types that need cleanup.
 */
const obj_deinit_func obj_deinit_tbl[TYPE_NR] = {
[0 ... TYPE_NR - 1] = NULL, /* default: no destructor */
[TYPE_CAP_GROUP] = cap_group_deinit,
[TYPE_THREAD] = thread_deinit,
[TYPE_CONNECTION] = connection_deinit,
[TYPE_NOTIFICATION] = notification_deinit,
[TYPE_IRQ] = irq_deinit,
[TYPE_PMO] = pmo_deinit,
[TYPE_VMSPACE] = vmspace_deinit,
#ifdef CHCORE_OPENTRUSTEE
[TYPE_CHANNEL] = channel_deinit,
[TYPE_MSG_HDL] = msg_hdl_deinit,
#endif /* CHCORE_OPENTRUSTEE */
[TYPE_PTRACE] = ptrace_deinit
};

针对这样的数据结构, capability.c之中小心地实现了cap_group的复制,释放,传递等函数,之中由于基于引用计数的所有权的复杂性+并发+为了性能降低锁的粒度,代码并不是非常可读,感兴趣的读者可以自行深入研究

回到文档,下一步是来探究一下cap_group的创建

create_root_cap_group 函数

该函数创建第一个 cap_group,也就是第一个用户态进程 init 所属的 cap_group

由于cap_group本身作为一个内核对象也需要包在cap_group里面,所以利用已经给的工具函数能够写出

        cap_group = obj_alloc(TYPE_CAP_GROUP, sizeof(*cap_group));
if (!cap_group) {
kwarn("failed alloc cap_group in %s\n", __func__);
return NULL;
}
cap_group_init(cap_group,
BASE_OBJECT_NUM,
/* Fixed badge */ ROOT_CAP_GROUP_BADGE);

slot_id = cap_alloc(cap_group, cap_group); // arg: *cap, *obj

申请了cap_group对象,然后把指向自己的指针放到了slot里面

有人可能会问: 这不是循环引用了吗?

我们要的正是循环引用,init进程不应该结束,它的cap_group是不应该被释放掉的。

然后我们作为第一个用户态进程需要一个 vmspace 的内核资源,类似的逻辑

        vmspace = obj_alloc(TYPE_VMSPACE, sizeof(*vmspace));
if (!vmspace) {
kwarn("failed alloc vmspace in %s\n", __func__);
return NULL;
}

/* fixed PCID 1 for root process, PCID 0 is not used. */
vmspace_init(vmspace, ROOT_PROCESS_PCID);

slot_id = cap_alloc(cap_group, vmspace);

之后就是给cap_group简单的命名

而另一个 sys_create_cap_group,则作为一个syscall 的handler呈现

const void *syscall_table[NR_SYSCALL] = {
// ...
[CHCORE_SYS_create_cap_group] = sys_create_cap_group,

传入的 `unsigned long cap_group_args_p` 就是用户态进程传递过来的、位于其虚拟地址空间内的指针,指向 create_cap_group 这个 syscall 的参数结构体(什么长难句)

明白了这个之后,它的逻辑就比较自然了

  1. 用户态的指针不可信,校验其地址空间是否伸进kernel,是否有创建权限
  2. copy 用户态的数据到内核(如果做过xv6的lab就可以发现, 这里的拷贝是没有传递用户态的页表的,在现代OS之中,内核态页表是包含用户态的页表项的,所以copy_from_user只是一个简单的memcpy而已)
  3. 类似上面的逻辑,alloc新的cap_group对象,cap_init初始化,根据传入的参数给几个属性赋值
  4. 新cap_group应该带上默认的cap, 即cap_group和vmspace(觉得绕的话回想cap是指针!)

具体的代码实现如下,还是不难的

/*
 * Syscall handler: create a new cap_group (process container).
 * cap_group_args_p is a user-space pointer to a struct cap_group_args.
 * On success, returns the capability (slot index in the CURRENT cap_group)
 * referring to the new cap_group; on failure, returns a negative errno.
 */
cap_t sys_create_cap_group(unsigned long cap_group_args_p)
{
struct cap_group *new_cap_group;
struct vmspace *vmspace;
cap_t cap;
int r;
struct cap_group_args args = {0};

/* Platform/policy hook; a non-zero result rejects the request. */
r = hook_sys_create_cap_group(cap_group_args_p);
if (r != 0) return r;

/* User pointers are untrusted: reject ranges reaching into the kernel. */
if (check_user_addr_range((vaddr_t)cap_group_args_p,
sizeof(struct cap_group_args)) != 0)
return -EINVAL;

r = copy_from_user(&args, (void *)cap_group_args_p, sizeof(struct cap_group_args));
if (r) {
return -EINVAL;
}

#ifdef CHCORE_OPENTRUSTEE
/* Validate the embedded user pointer from the copied-in args, too. */
if (check_user_addr_range((vaddr_t)args.puuid,
sizeof(TEE_UUID)) != 0)
return -EINVAL;
#endif /* CHCORE_OPENTRUSTEE */

if (check_user_addr_range((vaddr_t)args.name, (size_t)args.name_len) != 0)
return -EINVAL;

/* Allocate the new cap_group object; a cap for it is installed in the
 * CURRENT cap_group further below (cap_alloc). */
new_cap_group = obj_alloc(TYPE_CAP_GROUP, sizeof(*new_cap_group));
if (!new_cap_group) {
r = -ENOMEM;
goto out_fail;
}
cap_group_init(new_cap_group, BASE_OBJECT_NUM, args.badge);
new_cap_group->pid = args.pid;
#ifdef CHCORE_OPENTRUSTEE
new_cap_group->heap_size_limit = args.heap_size;
/* pid used in OH-TEE */
if (args.puuid) {
/* NOTE(review): return value of this copy_from_user is ignored —
 * a faulting puuid would leave uuid partially written; confirm
 * this is acceptable or should fail like the copy above. */
copy_from_user(&new_cap_group->uuid, (void *)args.puuid, sizeof(TEE_UUID));
} else {
memset(&new_cap_group->uuid, 0, sizeof(TEE_UUID));
}
#endif /* CHCORE_OPENTRUSTEE */

/* Install a cap for the new group into the CALLER's cap_group;
 * this is the value ultimately returned to user space. */
cap = cap_alloc(current_cap_group, new_cap_group);
if (cap < 0) {
r = cap;
goto out_free_obj_new_grp;
}

/* 1st cap is cap_group: copy the new group's cap into slot 0 of the
 * new group itself (the deliberate self-reference described above). */
if (cap_copy(current_thread->cap_group,
new_cap_group,
cap,
CAP_RIGHT_NO_RIGHTS,
CAP_RIGHT_NO_RIGHTS)
!= CAP_GROUP_OBJ_ID) {
kwarn("%s: cap_copy fails or cap[0] is not cap_group\n", __func__);
r = -ECAPBILITY;
goto out_free_cap_grp_current;
}

/* 2nd cap is vmspace */
vmspace = obj_alloc(TYPE_VMSPACE, sizeof(*vmspace));
if (!vmspace) {
r = -ENOMEM;
/* NOTE(review): this path reaches obj_free(vmspace) below with
 * vmspace == NULL; assumes obj_free(NULL) is a no-op — confirm. */
goto out_free_obj_vmspace;
}

vmspace_init(vmspace, args.pcid);

/* The vmspace cap must land in slot 1 (VMSPACE_OBJ_ID) of the new group. */
r = cap_alloc(new_cap_group, vmspace);
if (r != VMSPACE_OBJ_ID) {
kwarn("%s: cap_copy fails or cap[1] is not vmspace\n", __func__);
r = -ECAPBILITY;
goto out_free_obj_vmspace;
}

new_cap_group->notify_recycler = 0;

/* Set the cap_group_name (process_name) for easing debugging */
memset(new_cap_group->cap_group_name, 0, MAX_GROUP_NAME_LEN + 1);
if (args.name_len > MAX_GROUP_NAME_LEN)
args.name_len = MAX_GROUP_NAME_LEN;

r = copy_from_user(new_cap_group->cap_group_name,
(void *)args.name,
args.name_len);
if (r) {
r = -EINVAL;
/* NOTE(review): at this point the vmspace cap was already allocated
 * into new_cap_group, yet we still obj_free(vmspace) directly below —
 * verify this cannot double-free when the group is later recycled. */
goto out_free_obj_vmspace;
}

return cap;
/* Cleanup labels fall through cumulatively (standard goto-chain). */
out_free_obj_vmspace:
obj_free(vmspace);
out_free_cap_grp_current:
/* cap_free drops the caller's cap and releases the object; NULL-ing
 * new_cap_group makes the fallthrough obj_free below a no-op. */
cap_free(current_cap_group, cap);
new_cap_group = NULL;
out_free_obj_new_grp:
obj_free(new_cap_group);
out_fail:
return r;
}