summaryrefslogtreecommitdiff
path: root/tools/lib/bpf/bpf.c
diff options
context:
space:
mode:
authorDaniel Borkmann <daniel@iogearbox.net>2018-03-31 02:17:57 +0200
committerDaniel Borkmann <daniel@iogearbox.net>2018-03-31 02:18:07 +0200
commit7828f20e3779e4e85e55371e0e43f5006a15fb41 (patch)
tree48f4977b0b8e69bd6432b18556ad9ac7ca7728eb /tools/lib/bpf/bpf.c
parent807ae7daf5fb9ba9ef688344ae7c0d8cbebd211c (diff)
parent1d436885b23bf4474617914d7eb15e039c83ed99 (diff)
Merge branch 'bpf-cgroup-bind-connect'
Andrey Ignatov says: ==================== v2->v3: - rebase due to conflicts - fix ipv6=m build v1->v2: - support expected_attach_type at prog load time so that prog (incl. context accesses and calls to helpers) can be validated with regard to specific attach point it is supposed to be attached to. Later, at attach time, attach type is checked so that it must be same as at load time if it was provided - reworked hooks to rely on expected_attach_type, and reduced number of new prog types from 6 to just 1: BPF_PROG_TYPE_CGROUP_SOCK_ADDR - reused BPF_PROG_TYPE_CGROUP_SOCK for sys_bind post-hooks - add selftests for post-sys_bind hook For our container management we've been using complicated and fragile setup consisting of LD_PRELOAD wrapper intercepting bind and connect calls from all containerized applications. Unfortunately it doesn't work for apps that don't use glibc and changing all applications that run in the datacenter is not possible due to 3rd party code and libraries (despite being open source code) and sheer amount of legacy code that has to be rewritten (we're rewriting what we can in parallel) These applications are written without containers in mind and have builtin assumptions about network services. Like an application X expects to connect localhost:special_port and find service Y in there. To move application X and service Y into two different containers LD_PRELOAD approach is used to help one service connect to another without rewriting them. Moving these two applications into different L2 (netns) or L3 (vrf) network isolation scopes doesn't help to solve the problem, since applications need to see each other like they were running on the host without containers. So if app X and app Y would run in different netns something would need to punch a connectivity hole in those namespaces. That would be real layering violation (with corresponding network debugging pains), since clean l2, l3 abstraction would suddenly support something that breaks through the layers. Instead we used LD_PRELOAD (and now bpf programs) at bind/connect time to help applications discover and connect to each other. All applications are running in init_nens and there are no vrfs. After bind/connect the normal fib/neighbor core networking logic works as it should always do and the whole system is clean from network point of view and can be debugged with standard tools. We also considered resurrecting Hannes's afnetns work, but all hierarchical namespace abstraction don't work due to these builtin networking assumptions inside the apps. To run an application inside cgroup container that was not written with containers in mind we have to make an illusion of running in non-containerized environment. In some cases we remember the port and container id in the post-bind hook in a bpf map and when some other task in a different container is trying to connect to a service we need to know where this service is running. It can be remote and can be local. Both client and service may or may not be written with containers in mind and this sockaddr rewrite is providing connectivity and load balancing feature. BPF+cgroup looks to be the best solution for this problem. Hence we introduce 3 hooks: - at entry into sys_bind and sys_connect to let bpf prog look and modify 'struct sockaddr' provided by user space and fail bind/connect when appropriate - post sys_bind after port is allocated The approach works great and has zero overhead for anyone who doesn't use it and very low overhead when deployed. Different use case for this feature is to do low overhead firewall that doesn't need to inspect all packets and works at bind/connect time. ==================== Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Diffstat (limited to 'tools/lib/bpf/bpf.c')
-rw-r--r--tools/lib/bpf/bpf.c44
1 files changed, 29 insertions, 15 deletions
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index e0500055f1a6..acbb3f8b3bec 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -146,26 +146,30 @@ int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name,
-1);
}
-int bpf_load_program_name(enum bpf_prog_type type, const char *name,
- const struct bpf_insn *insns,
- size_t insns_cnt, const char *license,
- __u32 kern_version, char *log_buf,
- size_t log_buf_sz)
+int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
+ char *log_buf, size_t log_buf_sz)
{
- int fd;
union bpf_attr attr;
- __u32 name_len = name ? strlen(name) : 0;
+ __u32 name_len;
+ int fd;
+
+ if (!load_attr)
+ return -EINVAL;
+
+ name_len = load_attr->name ? strlen(load_attr->name) : 0;
bzero(&attr, sizeof(attr));
- attr.prog_type = type;
- attr.insn_cnt = (__u32)insns_cnt;
- attr.insns = ptr_to_u64(insns);
- attr.license = ptr_to_u64(license);
+ attr.prog_type = load_attr->prog_type;
+ attr.expected_attach_type = load_attr->expected_attach_type;
+ attr.insn_cnt = (__u32)load_attr->insns_cnt;
+ attr.insns = ptr_to_u64(load_attr->insns);
+ attr.license = ptr_to_u64(load_attr->license);
attr.log_buf = ptr_to_u64(NULL);
attr.log_size = 0;
attr.log_level = 0;
- attr.kern_version = kern_version;
- memcpy(attr.prog_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1));
+ attr.kern_version = load_attr->kern_version;
+ memcpy(attr.prog_name, load_attr->name,
+ min(name_len, BPF_OBJ_NAME_LEN - 1));
fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
if (fd >= 0 || !log_buf || !log_buf_sz)
@@ -184,8 +188,18 @@ int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
__u32 kern_version, char *log_buf,
size_t log_buf_sz)
{
- return bpf_load_program_name(type, NULL, insns, insns_cnt, license,
- kern_version, log_buf, log_buf_sz);
+ struct bpf_load_program_attr load_attr;
+
+ memset(&load_attr, 0, sizeof(struct bpf_load_program_attr));
+ load_attr.prog_type = type;
+ load_attr.expected_attach_type = 0;
+ load_attr.name = NULL;
+ load_attr.insns = insns;
+ load_attr.insns_cnt = insns_cnt;
+ load_attr.license = license;
+ load_attr.kern_version = kern_version;
+
+ return bpf_load_program_xattr(&load_attr, log_buf, log_buf_sz);
}
int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns,