SOCKMAP or specifically "BPF_MAP_TYPE_SOCKMAP", is a type of an eBPF map. This map is an "array" - indices are integers. All this is pretty standard. The magic is in the map values - they must be TCP socket descriptors.
copy from:https://blog.cloudflare.com/sockmap-tcp-splicing-of-the-future/
也就是eBPF程序必须attach一个map,不是attach一个socket。so how to use SOCKMAP ?
sock_map = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(int), sizeof(int), 2, 0)
prog_parser = bpf_load_program(BPF_PROG_TYPE_SK_SKB, ...)
prog_verdict = bpf_load_program(BPF_PROG_TYPE_SK_SKB, ...)
bpf_prog_attach(prog_parser, sock_map, BPF_SK_SKB_STREAM_PARSER)
bpf_prog_attach(prog_verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT)
先看看 bpf_create_map的作用: 创建一个map内存块
- BPF map的应用场景有几种:
- BPF程序和用户态的交互:BPF程序运行完,得到的结果存储到map中,供用户态访问;
- BPF程序内部交互:如果BPF程序内部需要用全局变量来交互,但是由于安全原因BPF程序不允许访问全局变量,可以使用map来充当全局变量;
- BPF Tail call:Tail call是一个BPF程序跳转到另一BPF程序,BPF程序首先通过BPF_MAP_TYPE_PROG_ARRAY类型的map来知道另一个BPF程序的指针,然后调用tail_call()的helper function来执行Tail call。
- BPF程序和内核态的交互:和BPF程序以外的内核程序交互,也可以使用map作为中介;
- Map 类型(map_type),就是上文提到的各种 Map 类型
- Map 的键大小(key_size),以字节为单位
- Map 的值大小(value_size),以字节为单位
- Map 的元素最大容量(max_entries),个数为单位
{
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32 map_type; /* one of enum bpf_map_type */
__u32 key_size; /* size of key in bytes */
__u32 value_size; /* size of value in bytes */
__u32 max_entries; /* max number of entries in a map */
__u32 map_flags; /* BPF_MAP_CREATE related
* flags defined above.
*/
__u32 inner_map_fd; /* fd pointing to the inner map */
__u32 numa_node; /* numa node (effective only if
* BPF_F_NUMA_NODE is set).
*/
char map_name[BPF_OBJ_NAME_LEN];
__u32 map_ifindex; /* ifindex of netdev to create on */
__u32 btf_fd; /* fd pointing to a BTF type data */
__u32 btf_key_type_id; /* BTF type_id of the key */
__u32 btf_value_type_id; /* BTF type_id of the value */
__u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel-
* struct stored as the
* map value
*/
};
---------------------------
}
/*
 * Create a BPF map from the five basic creation parameters.
 *
 * The arguments are packed into a struct bpf_create_map_attr (every other
 * field left zero) and handed to bpf_create_map_xattr(), whose return value
 * is passed straight back to the caller.
 */
int bpf_create_map(enum bpf_map_type map_type, int key_size,
		   int value_size, int max_entries, __u32 map_flags)
{
	struct bpf_create_map_attr create_attr = {
		/* e.g. BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_ARRAY */
		.map_type	= map_type,
		/* map flag bits */
		.map_flags	= map_flags,
		/* size of a key */
		.key_size	= key_size,
		/* size of a value */
		.value_size	= value_size,
		/* maximum number of key/value pairs */
		.max_entries	= max_entries,
	};

	return bpf_create_map_xattr(&create_attr);
}
int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
{
union bpf_attr attr;
memset(&attr, '\0', sizeof(attr));
// 完成 bpf_attr的赋值初始化
attr.map_type = create_attr->map_type;
attr.key_size = create_attr->key_size;
attr.value_size = create_attr->value_size;
attr.max_entries = create_attr->max_entries;
attr.map_flags = create_attr->map_flags;
if (create_attr->name)
memcpy(attr.map_name, create_attr->name,
min(strlen(create_attr->name), BPF_OBJ_NAME_LEN - 1));
attr.numa_node = create_attr->numa_node;
attr.btf_fd = create_attr->btf_fd;
attr.btf_key_type_id = create_attr->btf_key_type_id;
attr.btf_value_type_id = create_attr->btf_value_type_id;
attr.map_ifindex = create_attr->map_ifindex;
if (attr.map_type == BPF_MAP_TYPE_STRUCT_OPS)
attr.btf_vmlinux_value_type_id =
create_attr->btf_vmlinux_value_type_id;
else
attr.inner_map_fd = create_attr->inner_map_fd;
//调用bpf 系统调用 创建 一个map bpf 第一个参数为命令参数,比如: BPF_MAP_CREATE BPF_MAP_UPDATE_ELEM BPF_MAP_DELETE_ELEM
return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
View Code
可以看到 实际上 会调用一个map_create 函数 分配内存 并初始化一个map
static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_map_memory mem;
struct bpf_map *map;
int f_flags;
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
if (err)
return -EINVAL;
if (attr->btf_vmlinux_value_type_id) {
if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
attr->btf_key_type_id || attr->btf_value_type_id)
return -EINVAL;
} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
return -EINVAL;
}
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
return f_flags;
if (numa_node != NUMA_NO_NODE &&
((unsigned int)numa_node >= nr_node_ids ||
!node_online(numa_node)))
return -EINVAL;
/* find map type and init map: hashtable vs rbtree vs bloom vs ...
分配内存使用 */
map = find_and_alloc_map(attr);
if (IS_ERR(map))
return PTR_ERR(map);
err = bpf_obj_name_cpy(map->name, attr->map_name,
sizeof(attr->map_name));
if (err < 0)
goto free_map;
atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
map->spin_lock_off = -EINVAL;
----------------------------------------------
err = bpf_map_alloc_id(map); // 将map 和 idx-id 相关联索引
if (err)
goto free_map_sec;
err = bpf_map_new_fd(map, f_flags);// 将map 和fd 关联 一切皆文件
if (err < 0) {
/* failed to allocate fd.
* bpf_map_put_with_uref() is needed because the above
* bpf_map_alloc_id() has published the map
* to the userspace and the userspace may
* have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
*/
bpf_map_put_with_uref(map);
return err;
}
return err;
}
map_create 会调用:对应map_type的ops去分配内存等
以map_array为例:
/* Callback table for the array map type. */
static const struct bpf_map_ops array_ops = {
	/* allocation and teardown */
	.map_alloc		= array_map_alloc,
	.map_free		= array_map_free,

	/* element access */
	.map_lookup_elem	= array_map_lookup_elem,
	.map_update_elem	= array_map_update_elem,
	.map_delete_elem	= array_map_delete_elem,

	/* key iteration */
	.map_get_next_key	= array_map_get_next_key,
};
/* Binds BPF_MAP_TYPE_ARRAY to the array_ops callback table. */
static struct bpf_map_type_list array_type __read_mostly = {
	.type	= BPF_MAP_TYPE_ARRAY,
	.ops	= &array_ops,
};
/*
 * Allocate and initialise a struct bpf_array for an array-type map.
 *
 * Validates the creation attributes, computes the rounded element size and
 * the index mask, sizes and allocates the bpf_array, copies the mandatory
 * attributes into the embedded struct bpf_map and returns a pointer to it.
 *
 * Returns ERR_PTR(-EINVAL) for invalid attributes, ERR_PTR(-E2BIG) when the
 * value size or the rounded-up entry count is too large, and
 * ERR_PTR(-ENOMEM) when the size check or the allocation fails.
 */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
u32 elem_size, index_mask, max_entries;
bool unpriv = !capable(CAP_SYS_ADMIN);
struct bpf_array *array;
u64 array_size, mask64;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size == 0 || attr->map_flags)
return ERR_PTR(-EINVAL);
if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
/* if value_size is bigger, the user space won't be able to
* access the elements.
*/
return ERR_PTR(-E2BIG);
/* (1.1.1) Compute the per-element value size; the key size needs neither
 * computing nor storing because the key is simply the array index.
 */
elem_size = round_up(attr->value_size, 8);
max_entries = attr->max_entries;
/* On 32 bit archs roundup_pow_of_two() with max_entries that has
* upper most bit set in u32 space is undefined behavior due to
* resulting 1U << 32, so do it manually here in u64 space.
*/
mask64 = fls_long(max_entries - 1);
mask64 = 1ULL << mask64;
mask64 -= 1;
index_mask = mask64;
if (unpriv) {
/* round up array size to nearest power of 2,
* since cpu will speculate within index_mask limits
*/
max_entries = index_mask + 1;
/* Check for overflows. */
if (max_entries < attr->max_entries)
return ERR_PTR(-E2BIG);
}
/* (1.1.2) Compute the total size of bpf_array plus the value array;
 * bpf_array embeds the generic map structure bpf_map.
 */
array_size = sizeof(*array);
if (percpu)
array_size += (u64) max_entries * sizeof(void *);
else
array_size += (u64) max_entries * elem_size;
/* make sure there is no u32 overflow later in round_up() */
if (array_size >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-ENOMEM);
/* allocate all map elements and zero-initialize them */
/* (1.1.3) Allocate the bpf_array according to the total size */
array = bpf_map_area_alloc(array_size);
if (!array)
return ERR_PTR(-ENOMEM);
array->index_mask = index_mask;
array->map.unpriv_array = unpriv;
/* copy mandatory map attributes */
/* (1.1.4) Copy attr into array->map */
array->map.map_type = attr->map_type;
array->map.key_size = attr->key_size;
array->map.value_size = attr->value_size;
array->map.max_entries = attr->max_entries;
array->elem_size = elem_size;
/* Non-percpu maps are done; only the page accounting at "out" remains. */
if (!percpu)
goto out;
/* Per-CPU maps: add the per-CPU value storage to the accounted size and
 * allocate it; on any failure release the array allocated above.
 */
array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
if (array_size >= U32_MAX - PAGE_SIZE ||
elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
out:
/* Record the map's footprint in pages. */
array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
return &array->map;
}
View Code
bpf_load_program:用BPF_PROG_LOAD命令进行bpf系统调用加载 BPF 程序到内核中
- 拷贝程序到内核;
- 校验它的安全性;
- 如果可能对它进行JIT编译;
- 然后分配一个文件句柄fd给它
完成这一切后,后续再把这段BPF程序挂载到需要运行的钩子上面。
static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog;
int err;
char license[128];
bool is_gpl;
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
BPF_F_ANY_ALIGNMENT |
BPF_F_TEST_STATE_FREQ |
BPF_F_TEST_RND_HI32))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
(attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
!bpf_capable())
return -EPERM;
/* copy eBPF program license from user space
根据attr->license地址,从用户空间拷贝license字符串到内核 */
if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
sizeof(license) - 1) < 0)
return -EFAULT;
license[sizeof(license) - 1] = 0;
/* eBPF programs must be GPL compatible to use GPL-ed functions
判断license是否符合GPL协议*/
is_gpl = license_is_gpl_compatible(license);
//判断BPF的总指令数是否超过BPF_MAXINSNS(4k)
if (attr->insn_cnt == 0 ||
attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
return -E2BIG;
//对BPF_PROG_TYPE_SOCKET_FILTER和BPF_PROG_TYPE_CGROUP_SKB以外的BPF程序加载,需要管理员权限
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
!bpf_capable())
return -EPERM;
//对 CGROUP SOCK等需要admin 权限 或者 对应net 空间的权限
if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
return -EPERM;
if (is_perfmon_prog_type(type) && !perfmon_capable())
return -EPERM;
bpf_prog_load_fixup_attach_type(attr);
if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
attr->attach_btf_id,
attr->attach_prog_fd))
return -EINVAL;
/* plain bpf_prog allocation 根据BPF指令数分配bpf_prog空间,和bpf_prog->aux空间*/
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog)
return -ENOMEM;
prog->expected_attach_type = attr->expected_attach_type;
prog->aux->attach_btf_id = attr->attach_btf_id;
if (attr->attach_prog_fd) {
struct bpf_prog *tgt_prog;
tgt_prog = bpf_prog_get(attr->attach_prog_fd);
if (IS_ERR(tgt_prog)) {
err = PTR_ERR(tgt_prog);
goto free_prog_nouncharge;
}
prog->aux->linked_prog = tgt_prog;
}
prog->aux->offload_requested = !!attr->prog_ifindex;
err = security_bpf_prog_alloc(prog->aux);
if (err)
goto free_prog_nouncharge;
err = bpf_prog_charge_memlock(prog);
if (err)
goto free_prog_sec;
prog->len = attr->insn_cnt;
err = -EFAULT;//把BPF代码从用户空间地址attr->insns,拷贝到内核空间地址prog->insns
if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
bpf_prog_insn_size(prog)) != 0)
goto free_prog;
prog->orig_prog = NULL;
prog->jited = 0;
atomic64_set(&prog->aux->refcnt, 1);
prog->gpl_compatible = is_gpl ? 1 : 0;
if (bpf_prog_is_dev_bound(prog->aux)) {
err = bpf_prog_offload_init(prog, attr);
if (err)
goto free_prog;
}
/* find program type: socket_filter vs tracing_filter
根据attr->prog_type指定的type值,找到对应的bpf_prog_types,
给bpf_prog->aux->ops赋值,这个ops是一个函数操作集*/
err = find_prog_type(type, prog);
if (err < 0)
goto free_prog;
prog->aux->load_time = ktime_get_boottime_ns();
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
sizeof(attr->prog_name));
if (err < 0)
goto free_prog;
/* run eBPF verifier 使用verifer对BPF程序进行合法性扫描 */
err = bpf_check(&prog, attr, uattr);
if (err < 0)
goto free_used_maps;
/*尝试对BPF程序进行JIT转换*/
prog = bpf_prog_select_runtime(prog, &err);
if (err < 0)
goto free_used_maps;
//给BPF程序分配关联一个idx id索引
err = bpf_prog_alloc_id(prog);
if (err)
goto free_used_maps;
/* Upon success of bpf_prog_alloc_id(), the BPF prog is
* effectively publicly exposed. However, retrieving via
* bpf_prog_get_fd_by_id() will take another reference,
* therefore it cannot be gone underneath us.
*
* Only for the time /after/ successful bpf_prog_new_fd()
* and before returning to userspace, we might just hold
* one reference and any parallel close on that fd could
* rip everything out. Hence, below notifications must
* happen before bpf_prog_new_fd().
*
* Also, any failure handling from this point onwards must
* be using bpf_prog_put() given the program is exposed.
*/
bpf_prog_kallsyms_add(prog);
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
bpf_audit_prog(prog, BPF_AUDIT_LOAD);
//给BPF程序分配一个文件句柄fd
err = bpf_prog_new_fd(prog);
if (err < 0)
bpf_prog_put(prog);
return err;
--------------------------------
}
bpf_prog_attach:如何把我的bpf程序,attach到这些类型上:
重定向程序作为BPF_SK_SKB_STREAM_VERDICT附加到sockmap; 它应返回bpf_sk_redirect_map()的结果。
一个strparser程序通过BPF_SK_SKB_STREAM_PARSER附加,并且应返回已解析数据的长度。
能够获取什么样的context?
指向包含包元数据/数据的结构__sk_buff的指针。 但是,sk_skb程序类型可以访问更多字段。 可用的额外字段集记录在 include/linux/bpf.h 中。
什么时候会运行?
可以通过把带有 BPF_SK_SKB_STREAM_PARSER 的程序附加到 sockmap 上,来把一个 stream parser 附加到 socket 上;之后,当 socket 接收数据时,该程序会通过 kernel/bpf/sockmap.c 中的 smap_parse_func_strparser() 被执行。
BPF_SK_SKB_STREAM_VERDICT 也会附加到 sockmap 上,它通过 smap_verdict_func() 来执行。
/* bpf_load_program
bpf_prog_attach(verdict_prog, map_fd, BPF_SK_SKB_STREAM_VERDICT, 0);
int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type,
unsigned int flags)
{
DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, opts,
.flags = flags,
);
return bpf_prog_attach_xattr(prog_fd, target_fd, type, &opts);
}
int bpf_prog_attach_xattr(int prog_fd, int target_fd,
enum bpf_attach_type type,
const struct bpf_prog_attach_opts *opts)
{
union bpf_attr attr;
if (!OPTS_VALID(opts, bpf_prog_attach_opts))
return -EINVAL;
memset(&attr, 0, sizeof(attr));
attr.target_fd = target_fd;
attr.attach_bpf_fd = prog_fd;
attr.attach_type = type;
attr.attach_flags = OPTS_GET(opts, flags, 0);
attr.replace_bpf_fd = OPTS_GET(opts, replace_prog_fd, 0);
return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
}
int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
struct bpf_prog *old, u32 which)
{
struct sk_psock_progs *progs = sock_map_progs(map);
struct bpf_prog **pprog;
switch (which) {
------------------------------------------
case BPF_SK_SKB_STREAM_PARSER:
pprog = &progs->skb_parser;
break;
case BPF_SK_SKB_STREAM_VERDICT:
pprog = &progs->skb_verdict;
break;
}
psock_set_prog(pprog, prog);
return 0;
}
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
{
u32 ufd = attr->target_fd;
struct bpf_map *map;
struct fd f;
int ret;
if (attr->attach_flags || attr->replace_bpf_fd)
return -EINVAL;
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
ret = sock_map_prog_update(map, prog, NULL, attr->attach_type);---//找到对应的sk_psock_progs 并更新
fdput(f);
return ret;
}
*/
static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;BPF_SOCK_STREAM_VERDICT
struct bpf_prog *prog = NULL;
int ret;
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
return -EINVAL;
//------BPF_SK_SKB_STREAM_VERDICT-------> transmit -----BPF_PROG_TYPE_SK_SKB 也就是attach type 转换为 prog-type
ptype = attach_type_to_prog_type(attr->attach_type);
if (ptype == BPF_PROG_TYPE_UNSPEC)
return -EINVAL;
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
if (IS_ERR(prog))
return PTR_ERR(prog);
if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
bpf_prog_put(prog);
return -EINVAL;
}
/*
const struct bpf_verifier_ops sk_skb_verifier_ops = {
.get_func_proto = sk_skb_func_proto,--------------bpf_sk_redirect_map_proto----------bpf_msg_redirect_map
.is_valid_access = sk_skb_is_valid_access,
.convert_ctx_access = sk_skb_convert_ctx_access,
.gen_prologue = sk_skb_prologue,
};
*/
switch (ptype) {
case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG:
ret = sock_map_get_from_fd(attr, prog);// 根据target_fd 找到 map 并关联对应map
break;
case BPF_PROG_TYPE_LIRC_MODE2:
ret = lirc_prog_attach(attr, prog);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
ret = netns_bpf_prog_attach(attr, prog);
break;
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
ret = cgroup_bpf_prog_attach(attr, ptype, prog);
break;
default:
ret = -EINVAL;
}
if (ret)
bpf_prog_put(prog);
return ret;
}
sock_map
int val = fd;
bpf_map_update_elem(sock_map, &idx, &val, BPF_ANY);
bpf_map_update_elem: 将fd socket 和map相关联
会执行系统调用 bpf(BPF_MAP_UPDATE_ELEM,-----) 最后调用map_update_elem 函数处理
static int map_update_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);// 对应idx 索引
void __user *uvalue = u64_to_user_ptr(attr->value);//对应 键值 value 比如 需要执行动作的socket--fd
int ufd = attr->map_fd;
-----------------------
f = fdget(ufd);// map_fd--->file--->对应的map 内存
map = __bpf_map_get(f);// map_fd--->file--->对应的map 内存 f.file->private_data;
------------------------------
----------------------------------// 将 key value 更新到map 中
err = bpf_map_update_value(map, f, key, value, attr->flags);
}
static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
void *value, __u64 flags)
{
int err;
/* Need to create a kthread, thus must support schedule */
if (bpf_map_is_dev_bound(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
map->map_type == BPF_MAP_TYPE_SOCKHASH ||
map->map_type == BPF_MAP_TYPE_SOCKMAP ||//sock_map_update_elem
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
return map->ops->map_update_elem(map, key, value, flags);//
}
//------------------
return err;
以sock_map_update_elem 为例查看
static int sock_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
u32 idx = *(u32 *)key;
struct socket *sock;
struct sock *sk;
int ret;
u64 ufd;
if (map->value_size == sizeof(u64))
ufd = *(u64 *)value;
else
ufd = *(u32 *)value;
---------------------------
sock = sockfd_lookup(ufd, &ret);// 根据value:sockt-fd 找到对应的struct socket
----------
sk = sock->sk;//sock---对应的net sk 结构体
-----------
ret = sock_map_update_common(map, idx, sk, flags);
}
/*
 * Install socket @sk into slot @idx of the sockmap @map.
 *
 * Allocates a psock link, links the socket to the map's programs via
 * sock_map_link(), then, under stab->lock, records the link, stores @sk in
 * the slot and drops the reference on the socket that previously occupied
 * the slot, if any.
 *
 * Returns 0 on success, -ENOMEM when the link cannot be allocated, or the
 * negative error reported by sock_map_link().
 *
 * NOTE(review): @flags is not consulted in this excerpt; the upstream
 * function is believed to honour BPF_NOEXIST/BPF_EXIST here - confirm
 * against net/core/sock_map.c.
 */
static int sock_map_update_common(struct bpf_map *map, u32 idx,
				  struct sock *sk, u64 flags)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct sk_psock_link *link;
	struct sk_psock *psock;
	struct sock *osk;
	int ret;

	/* Allocate the link object that ties the psock to this map slot. */
	link = sk_psock_init_link();
	if (!link)
		return -ENOMEM;

	/* Only sockets we can redirect into/from in BPF need to hold
	 * refs to parser/verdict progs and have their sk_data_ready
	 * and sk_write_space callbacks overridden.
	 */
	ret = sock_map_link(map, &stab->progs, sk);
	if (ret < 0) {
		sk_psock_free_link(link);
		return ret;
	}

	psock = sk_psock(sk);
	WARN_ON_ONCE(!psock);

	raw_spin_lock_bh(&stab->lock);
	osk = stab->sks[idx];
	sock_map_add_link(psock, link, map, &stab->sks[idx]);
	stab->sks[idx] = sk;
	/* The slot may have been empty, in which case there is nothing to unref. */
	if (osk)
		sock_map_unref(osk, &stab->sks[idx]);
	/* Release the lock taken above before returning. */
	raw_spin_unlock_bh(&stab->lock);
	return 0;
}
static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
struct sock *sk)
{
struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
struct sk_psock *psock;
bool skb_progs;
int ret;
skb_verdict = READ_ONCE(progs->skb_verdict);-------赋值见-sock_map_prog_update
skb_parser = READ_ONCE(progs->skb_parser);
skb_progs = skb_parser && skb_verdict;
---------------------
msg_parser = READ_ONCE(progs->msg_parser);
------------------
psock = sock_map_psock_get_checked(sk);
if (IS_ERR(psock)) {
ret = PTR_ERR(psock);
goto out_progs;
}
-------------------
psock = sk_psock_init(sk, map->numa_node);
将sk 和psock 相关联:创建psock ;psock->sk = sk;
---------------------
//主要是sk->sk_prot=ops 替换sk 的ops 函数;替换为bpf_ops
ret = sock_map_init_proto(sk, psock);
if (ret < 0)
goto out_drop;
if (skb_progs && !psock->parser.enabled) {
ret = sk_psock_init_strp(sk, psock);//设置strparser cb 回调函数
if (ret) {
write_unlock_bh(&sk->sk_callback_lock);
goto out_drop;
}
psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
psock_set_prog(&psock->progs.skb_parser, skb_parser);
//设置 sk 的data_ready 数据到达唤醒函数
sk_psock_start_strp(sk, psock);
}
return 0;
}
/*
 * Hook the psock stream parser into the socket's callbacks.
 *
 * If the parser is not yet enabled, remember the socket's current
 * sk_data_ready callback, point sk_data_ready and sk_write_space at the
 * psock handlers, and mark the parser enabled. Does nothing when the parser
 * is already enabled.
 */
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
{
	struct sk_psock_parser *strp_state = &psock->parser;

	if (!strp_state->enabled) {
		/* Keep the original callback so it can be restored later. */
		strp_state->saved_data_ready = sk->sk_data_ready;
		sk->sk_data_ready = sk_psock_strp_data_ready;
		sk->sk_write_space = sk_psock_write_space;
		strp_state->enabled = true;
	}
}
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
{
static const struct strp_callbacks cb = {
.rcv_msg = sk_psock_strp_read,
.read_sock_done = sk_psock_strp_read_done,
.parse_msg = sk_psock_strp_parse,
};
psock->parser.enabled = false;
return strp_init(&psock->parser.strp, sk, &cb);
}
设置strparser cb 回调函数
/*
 * Initialise a stream parser and install its callbacks (abridged excerpt).
 *
 * Zeroes @strp, binds it to @sk and copies the callbacks from @cb, with
 * defaults substituted for lock, unlock, read_sock_done and abort_parser
 * when the caller leaves them NULL. Also initialises the message-timeout
 * delayed work and the parser work item. Returns 0.
 */
int strp_init(struct strparser *strp, struct sock *sk,
	      const struct strp_callbacks *cb)
{
	// code omitted

	/* The sk (sock) arg determines the mode of the stream parser.
	 *
	 * If the sock is set then the strparser is in receive callback mode.
	 * The upper layer calls strp_data_ready to kick receive processing
	 * and strparser calls the read_sock function on the socket to
	 * get packets.
	 *
	 * If the sock is not set then the strparser is in general mode.
	 * The upper layer calls strp_process for each skb to be parsed.
	 */

	// code omitted

	memset(strp, 0, sizeof(*strp));

	strp->sk = sk;

	/* Optional callbacks fall back to the strparser defaults. */
	strp->cb.lock = cb->lock ? : strp_sock_lock;
	strp->cb.unlock = cb->unlock ? : strp_sock_unlock;
	strp->cb.rcv_msg = cb->rcv_msg;
	strp->cb.parse_msg = cb->parse_msg;
	strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done;
	strp->cb.abort_parser = cb->abort_parser ? : strp_abort_strp;

	INIT_DELAYED_WORK(&strp->msg_timer_work, strp_msg_timeout);
	INIT_WORK(&strp->work, strp_work);

	return 0;
}
/*
 * Build the BPF-aware variants of a TCP struct proto.
 *
 * The BASE entry is a copy of @base with unhash, close, recvmsg and
 * stream_memory_read overridden; the TX entry is a copy of BASE that
 * additionally overrides sendmsg and sendpage.
 */
static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
				   struct proto *base)
{
	struct proto *rx_only = &prot[TCP_BPF_BASE];
	struct proto *rx_tx = &prot[TCP_BPF_TX];

	/* Start from the stock proto and swap in the sockmap hooks. */
	*rx_only = *base;
	rx_only->unhash = sock_map_unhash;
	rx_only->close = sock_map_close;
	rx_only->recvmsg = tcp_bpf_recvmsg;
	rx_only->stream_memory_read = tcp_bpf_stream_read;

	/* The TX variant inherits everything above and adds the send path. */
	*rx_tx = *rx_only;
	rx_tx->sendmsg = tcp_bpf_sendmsg;
	rx_tx->sendpage = tcp_bpf_sendpage;
}
/*
 * Select the BPF-aware struct proto for a TCP socket.
 *
 * The address family picks the IPv4 or IPv6 row and the presence of a
 * msg_parser program picks the TX or BASE column of tcp_bpf_prots. When the
 * psock has no saved sk_proto yet, the socket's current proto ops are
 * checked first and ERR_PTR(-EINVAL) is returned if that check fails.
 */
struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
{
	int family;
	int config;

	if (sk->sk_family == AF_INET6)
		family = TCP_BPF_IPV6;
	else
		family = TCP_BPF_IPV4;

	if (psock->progs.msg_parser)
		config = TCP_BPF_TX;
	else
		config = TCP_BPF_BASE;

	if (!psock->sk_proto) {
		struct proto *cur_ops = READ_ONCE(sk->sk_prot);

		if (tcp_bpf_assert_proto_ops(cur_ops))
			return ERR_PTR(-EINVAL);

		tcp_bpf_check_v6_needs_rebuild(sk, cur_ops);
	}

	return &tcp_bpf_prots[family][config];
}
/*
 * Switch a socket over to the BPF-aware protocol operations.
 *
 * Picks the replacement struct proto according to the socket type (TCP for
 * SOCK_STREAM, UDP for SOCK_DGRAM) and installs it with
 * sk_psock_update_proto().
 *
 * Returns 0 on success, -EINVAL for any other socket type, or the error
 * encoded in the pointer returned by the proto lookup (tcp_bpf_get_proto()
 * can return ERR_PTR(-EINVAL)).
 */
static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
{
	struct proto *prot;

	switch (sk->sk_type) {
	case SOCK_STREAM:
		prot = tcp_bpf_get_proto(sk, psock);
		break;
	case SOCK_DGRAM:
		prot = udp_bpf_get_proto(sk, psock);
		break;
	default:
		/* Without this branch prot would be used uninitialised. */
		return -EINVAL;
	}

	if (IS_ERR(prot))
		return PTR_ERR(prot);

	sk_psock_update_proto(sk, psock, prot);
	return 0;
}
sd
/*
 * Stream-parser program: reports the full length of the skb as the length
 * of the parsed message.
 */
SEC("prog_parser")
int _prog_parser(struct __sk_buff *skb)
{
	const int msg_len = skb->len;

	return msg_len;
}
/*
 * Verdict program: redirects every skb to the socket stored at index 0 of
 * sock_map and returns the helper's result as the verdict.
 */
SEC("prog_verdict")
int _prog_verdict(struct __sk_buff *skb)
{
	uint32_t target_slot = 0;

	return bpf_sk_redirect_map(skb, &sock_map, target_slot, 0);
}
bpf_sk_redirect_map
tells the kernel: for the received packet, please oh please redirect it from a receive queue of some socket,to a transmit queue of the socket living in sock_map under index 0. In our case, these are the same sockets!Here we achieved exactly what the echo server is supposed to do, but purely in eBPF.
/*
 * Prototype of the bpf_sk_redirect_map helper: takes
 * (ctx, map, key, flags), returns an integer, and is not GPL-only.
 */
const struct bpf_func_proto bpf_sk_redirect_map_proto = {
	.func		= bpf_sk_redirect_map,
	.gpl_only	= false,

	/* arguments, in call order */
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,

	.ret_type	= RET_INTEGER,
};
/*
 * Redirect helper operating on a struct sk_msg.
 *
 * Rejects any flag other than BPF_F_INGRESS, looks up the socket stored
 * under @key in @map, and - if one exists and redirecting to it is
 * allowed - records the flags and the target socket in the message.
 * Returns SK_PASS on success and SK_DROP otherwise.
 *
 * NOTE(review): the surrounding text and the prototype shown before this
 * block refer to bpf_sk_redirect_map, while this is the sk_msg variant.
 */
BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
	   struct bpf_map *, map, u32, key, u64, flags)
{
	struct sock *target;

	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return SK_DROP;

	target = __sock_map_lookup_elem(map, key);
	if (unlikely(!target))
		return SK_DROP;
	if (unlikely(!sock_map_redirect_allowed(target)))
		return SK_DROP;

	msg->flags = flags;
	msg->sk_redir = target;
	return SK_PASS;
}
参考学习:
eBPF学习用例:
Linux 内核观测技术 BPF书籍
https://davidlovezoe.club/wordpress/archives/862
http://arthurchiao.art/blog/cilium-life-of-a-packet-pod-to-service-zh/
https://switch-router.gitee.io/blog/strparser/
https://davidlovezoe.club/wordpress/archives/963
https://patchwork.ozlabs.org/project/netdev/patch/20170816053247.15445.69312.stgit@john-Precision-Tower-5810/
https://jishuin.proginn.com/p/763bfbd2bc4e
https://github.com/zoidbergwill/awesome-ebpf
https://patchwork.ozlabs.org/project/netdev/patch/20170816053247.15445.69312.stgit@john-Precision-Tower-5810/
https://switch-router.gitee.io/blog/strparser/
https://blogs.oracle.com/linux/notes-on-bpf-1
总结:
- eBPF程序处理截获报文的例子:psock,psock 使用 strparser,将数据包的控制权转移到 eBPF 处理程序,用户可以在 eBPF 程序里完成网络报文的重定向;sockmap 建立在 psock 之上,而 psock 的底层则是 strparser
strparser 的工作原理
核心数据结构:struct strparser 是 strparser 框架的核心数据结构,它绑定(attach)一个 TCP sock 结构 sk 和一组回调函数 cb
/*
 * Core object of the strparser framework (abridged excerpt): ties a socket
 * to a set of callbacks.
 */
struct strparser {
/* socket this parser is bound to */
struct sock *sk;
// code omitted ....
/* callbacks supplied by the user of the framework */
struct strp_callbacks cb;
};
回调函数一共有以下六个:
/*
 * The six callbacks a strparser user can provide. strp_init() requires
 * nothing beyond what it copies: lock, unlock, read_sock_done and
 * abort_parser are replaced by defaults there when left NULL.
 */
struct strp_callbacks {
	/* called when data arrives, to determine the next message */
	int (*parse_msg)(struct strparser *strp, struct sk_buff *skb);
	/* called with the received message */
	void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb);
	int (*read_sock_done)(struct strparser *strp, int err);
	void (*abort_parser)(struct strparser *strp, int err);
	void (*lock)(struct strparser *strp);
	void (*unlock)(struct strparser *strp);
};
parse_msg() 在 strparser 收到报文时被框架调用。它用于从报文中提取下一个应用层消息(message)的长度。一个 TCP 报文里可能不止一个应用层消息,而 parse_msg() 就是提供给使用者去识别各个消息的手段
strparser 截获报文
正常情况下,内核 TCP 层处理报文后,会调用 sock->sk_data_ready(sk) , 它的默认动作是 wake up 一个用户态进程.
/*
 * Abridged excerpt: after the omitted processing, the socket's
 * sk_data_ready callback is invoked on @sk.
 */
void tcp_data_ready(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
// code omitted
/* hand over to whatever callback is currently installed on the socket */
sk->sk_data_ready(sk);
}
我们期望报文能进入 strparser ,但报文显然不会平白无故地进入 strparser ,因此,我们需要在报文的上送路径上动一些手脚:替换掉 sk->sk_data_ready 函数
static int tls_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len){
// code omitted
tsk->saved_sk_data_ready = tsk->socket->sk->sk_data_ready;
tsk->saved_sk_write_space = tsk->socket->sk->sk_write_space;sk_write_space
tsk->socket->sk->sk_data_ready = tls_data_ready;
tsk->socket->sk->sk_write_space = tls_write_space;
tsk->socket->sk->sk_user_data = tsk;
// code omitted
}
在 psock 的例子中, sk_psock_strp_data_ready() 被赋值到 sk->sk_data_ready
/*
 * Abridged excerpt: saves the socket's current sk_data_ready callback,
 * installs the psock handlers for sk_data_ready and sk_write_space, and
 * marks the parser enabled.
 */
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
{
struct sk_psock_parser *parser = &psock->parser;
// code omitted
/* keep the original callback before replacing it */
parser->saved_data_ready = sk->sk_data_ready;
sk->sk_data_ready = sk_psock_strp_data_ready;
sk->sk_write_space = sk_psock_write_space;
parser->enabled = true;
}
替换之后,当有 TCP 报文准备上送时,用户定义的 sk->sk_data_ready 函数就会被调用,在该函数中,KTLS/psock 需要调用框架函数 strp_data_ready() 将报文转交给 strparser 框架。
对 KTLS
/*
 * sk_data_ready replacement for KTLS: resolves the socket's software RX
 * context and passes control to the strparser embedded in it.
 */
static void tls_data_ready(struct sock *sk)
{
	struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(tls_get_ctx(sk));

	strp_data_ready(&rx_ctx->strp);
}
View Code
对 psock
/*
 * sk_data_ready replacement for psock: inside an RCU read-side section,
 * looks up the socket's psock and, if there is one, calls strp_data_ready()
 * on its parser while holding sk_callback_lock for writing.
 */
static void sk_psock_strp_data_ready(struct sock *sk)
{
	struct sk_psock *ps;

	rcu_read_lock();
	ps = sk_psock(sk);
	if (!likely(ps))
		goto out;

	write_lock_bh(&sk->sk_callback_lock);
	strp_data_ready(&ps->parser.strp);
	write_unlock_bh(&sk->sk_callback_lock);
out:
	rcu_read_unlock();
}
strparser 处理报文
strparser 框架拿到报文之后,通常会依次调用用户设置的 parse_msg 和 rcv_msg 回调函数,用户在回调函数里用来决定报文应该何去何从
strp_data_ready
|- strp_read_sock
|- tcp_read_sock
|- strp_recv
|- __strp_recv
|- strp->cb.parse_msg(strp, head)
...
|- strp->cb.rcv_msg(strp, head);
比如对 KTLS, 就是将报文上送给应用层(AF_KTLS socket)
/*
 * Abridged excerpt of the KTLS rcv_msg callback: takes the TLS socket from
 * the parser socket's sk_user_data and queues the skb on it with
 * sock_queue_rcv_skb().
 */
static void tls_queue(struct strparser *strp, struct sk_buff *skb)
{
struct tls_sock *tsk;
// code omitted
tsk = strp->sk->sk_user_data;
// code omitted
/* NOTE(review): ret is declared in the omitted part of the function */
ret = sock_queue_rcv_skb((struct sock *)tsk, skb);
// code omitted
}
而对于 psock, 则是运行 eBPF 程序,得到动作(verdict)。
/*
 * rcv_msg callback of the psock strparser.
 *
 * Runs the psock's skb_verdict program on @skb, if one is attached, maps
 * its result to a verdict, and applies that verdict. Without a program the
 * verdict stays at its initial value, __SK_DROP.
 */
static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
{
	struct sk_psock *psock = sk_psock_from_strp(strp);
	struct bpf_prog *prog;
	int ret = __SK_DROP;

	rcu_read_lock();
	prog = READ_ONCE(psock->progs.skb_verdict);
	if (likely(prog)) {
		skb_orphan(skb);
		tcp_skb_bpf_redirect_clear(skb);
		/* per the original note: a program that redirects returns SK_PASS */
		ret = sk_psock_bpf_run(psock, prog, skb);
		ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
	}
	rcu_read_unlock();
	sk_psock_verdict_apply(psock, skb, ret);
}
strparser 这个框架并不限定如何处理报文,而只是在内核层面提供给了用户一个提前处理 TCP 报文的时机和一组回调函数,用户通过不同的回调函数可以实现不同的逻辑。
https://switch-router.gitee.io/blog/strparser/-----------------------------------------*************************------------------------------------------------------
http代理服务器(3-4-7层代理)-网络事件库公共组件、内核kernel驱动 摄像头驱动 tcpip网络协议栈、netfilter、bridge 好像看过!!!! 但行好事 莫问前程 --身高体重180的胖子