The NFQUEUE Mechanism in Kubernetes POD Environments

The previous article, <<Kubernetes环境中NFQUEUE与MARK机制冲突>>, described how we use the NFQUEUE mechanism to deliver packets to user space for security inspection. The original program logic simply accepted packets coming from virtual network devices. When we changed the logic to also inspect traffic on POD virtual NICs, POD networking stopped working. Troubleshooting showed that after the packets were queued to user space, no verdict ever came back from the user-space program.

A rough read of the NFQUEUE source code shows that the NFQUEUE mechanism is network-namespace aware. The queue that carries packets from a POD's virtual network device to user space is created inside that POD's own network namespace, and it is completely independent of the queues in the default network namespace, init_net. Our user-space program runs in init_net, while in the POD's network namespace no user-space program is reading the queue, so the packets are dropped.
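
The relevant logic lives in net/netfilter/nfnetlink_queue.c. The sketch below is a simplified paraphrase (names and details vary across kernel versions, so treat it as an illustration rather than the exact source): each network namespace gets its own queue instance table, and when no instance exists in the packet's namespace the enqueue fails and the packet is dropped unless bypass is requested.

/* Simplified paraphrase of net/netfilter/nfnetlink_queue.c; names and
 * details vary across kernel versions, treat this as an illustration. */
static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
{
    /* Each network namespace has its own private instance table. */
    return net_generic(net, nfnl_queue_net_id);
}

static int nfqnl_enqueue_packet(struct nf_queue_entry *entry,
                                unsigned int queuenum)
{
    /* The queue is looked up in the namespace the packet belongs to,
     * not in init_net. */
    struct nfnl_queue_net *q = nfnl_queue_pernet(entry->state.net);
    struct nfqnl_instance *queue = instance_lookup(q, queuenum);

    if (!queue)
        return -ESRCH;  /* no listener in this netns: the caller drops
                         * the skb unless QUEUE_BYPASS is set */

    /* ... build an nfnetlink message and unicast it to the listener ... */
    return 0;
}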

As in the previous article, we experiment with a simplified program. The Kubernetes environment for the experiment has 3 nodes, and the container network uses flannel.

We create two busybox pods:

kubectl run busybox1 --image=busybox --command -- sleep 3600
kubectl run busybox2 --image=busybox --command -- sleep 3600

They are located on node1 and node2 respectively:

[root@master1 scripts]# kubectl get pods -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
busybox1-77bb94599d-x89z4 1/1 Running 12 22h 10.230.96.4 node1 <none> <none>
busybox2-7d76b658b6-h5r2k 1/1 Running 10 22h 10.230.12.2 node2 <none> <none>

We access busybox1 from busybox2, and network connectivity is fine:

[root@master1 scripts]# kubectl exec -ti busybox2-7d76b658b6-h5r2k -- ping -c2 10.230.96.4
PING 10.230.96.4 (10.230.96.4): 56 data bytes
64 bytes from 10.230.96.4: seq=0 ttl=62 time=1.076 ms
64 bytes from 10.230.96.4: seq=1 ttl=62 time=0.770 ms

--- 10.230.96.4 ping statistics ---
2 packets transmitted, 2 packets received, 0% packet loss
round-trip min/avg/max = 0.770/0.923/1.076 ms

We write a kernel module that sends packets to user space through an NFQUEUE queue:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/inet.h>

MODULE_LICENSE("GPL");

static int queue_bypass = 0;
MODULE_PARM_DESC(queue_bypass, "queue bypass on/off switch");
module_param(queue_bypass, int, 0600);

static unsigned int nf_hook_in1(const struct nf_hook_ops *ops,
                                struct sk_buff *skb,
                                const struct net_device *in,
                                const struct net_device *out,
                                const struct nf_hook_state *state)
{
    unsigned int ret;
    struct iphdr *iph;

    if (skb->protocol != __constant_htons(ETH_P_IP)) {
        return NF_ACCEPT;
    }

    iph = ip_hdr(skb);
    if (!iph || iph->protocol != IPPROTO_ICMP) {
        return NF_ACCEPT;
    }

    /* Log the packet's network namespace so it can be compared with init_net. */
    printk(KERN_INFO "NSQUEUE: ICMP packet: %u.%u.%u.%u->%u.%u.%u.%u, " \
           "init_net: %p, net: %p, devname: %s\n",
           (ntohl(iph->saddr) >> 24) & 0xFF,
           (ntohl(iph->saddr) >> 16) & 0xFF,
           (ntohl(iph->saddr) >> 8) & 0xFF,
           (ntohl(iph->saddr)) & 0xFF,
           (ntohl(iph->daddr) >> 24) & 0xFF,
           (ntohl(iph->daddr) >> 16) & 0xFF,
           (ntohl(iph->daddr) >> 8) & 0xFF,
           (ntohl(iph->daddr)) & 0xFF,
           &init_net,
           dev_net(skb->dev),
           skb->dev->name
           );

    /* Queue ICMP packets to user space on queue 80. */
    ret = NF_QUEUE_NR(80);
    if (queue_bypass) {
        ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
    }

    return ret;
}

static struct nf_hook_ops nfhooks[] = {
    {
        .hook = nf_hook_in1,
        .owner = THIS_MODULE,
        .pf = NFPROTO_IPV4,
        .hooknum = NF_INET_LOCAL_IN,
        .priority = NF_IP_PRI_FIRST,
    },
};

int __init nsqueue_init(void)
{
    nf_register_hooks(nfhooks, ARRAY_SIZE(nfhooks));

    printk(KERN_INFO "NSQUEUE: module init\n");

    return 0;
}

void __exit nsqueue_exit(void)
{
    nf_unregister_hooks(nfhooks, ARRAY_SIZE(nfhooks));

    printk(KERN_INFO "NSQUEUE: module exit\n");

    return;
}

module_init(nsqueue_init);
module_exit(nsqueue_exit);

The module sends ICMP packets to queue number 80 via the NFQUEUE mechanism.
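
NF_QUEUE_NR encodes the queue number into the upper bits of the verdict value. For reference, the relevant definitions from include/uapi/linux/netfilter.h look roughly like this (the exact form may differ slightly between kernel versions):

/* From include/uapi/linux/netfilter.h (may vary slightly by version):
 * the target queue number is packed into the upper 16 bits of the verdict,
 * and QUEUE_BYPASS is one of the extra verdict flags. */
#define NF_VERDICT_FLAG_QUEUE_BYPASS 0x00008000

#define NF_VERDICT_QMASK 0xffff0000
#define NF_VERDICT_QBITS 16

#define NF_QUEUE_NR(x) ((((x) << 16) & NF_VERDICT_QMASK) | NF_QUEUE)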

Makefile:

obj-m += nsqueue.o

all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules

clean:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean

On node1, where busybox1 is located, compile and load nsqueue.ko:

insmod nsqueue.ko

Then we try to access busybox1 from busybox2 again. The requests time out; the network is indeed broken:

[root@master1 scripts]# kubectl exec -ti busybox2-7d76b658b6-h5r2k -- ping -c2 10.230.96.4
PING 10.230.96.4 (10.230.96.4): 56 data bytes

--- 10.230.96.4 ping statistics ---
2 packets transmitted, 0 packets received, 100% packet loss
command terminated with exit code 1

Checking the dmesg output on node1, we can see that nsqueue.ko processed two packets:

[32555.762163] NSQUEUE: module init
[32634.888279] NSQUEUE: ICMP packet: 10.230.12.0->10.230.96.4, init_net: ffffffff85315bc0, net: ffff8ac077451480, devname: eth0
[32635.888896] NSQUEUE: ICMP packet: 10.230.12.0->10.230.96.4, init_net: ffffffff85315bc0, net: ffff8ac077451480, devname: eth0

Note that in the dmesg output the net pointer differs from init_net, confirming that the packets are queued in the POD's own network namespace rather than in init_net. Next, we run the user-space program inside busybox1's network namespace.

First, get the container ID with kubectl:

[root@master1 scripts]# kubectl describe pod busybox1-77bb94599d-x89z4 |grep 'Container ID'
Container ID: containerd://4872c95767c2504f6d646b54e6843a30905d0dec27b9d5934d01f3301ac220e1

Next, on node1, find the pod's pid from the container ID:

[root@node1 vagrant]# crictl inspect 4872c95767c2504f6d646b54e6843a30905d0dec27b9d5934d01f3301ac220e1 |grep pid
"pid": 25207,
"pid": 1
"type": "pid"

Then, on node1, enter busybox1's network namespace with nsenter:

[root@node1 vagrant]# nsenter -n -t 25207
[root@node1 vagrant]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
3: eth0@if7: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue state UP group default
link/ether c2:1a:09:a7:69:78 brd ff:ff:ff:ff:ff:ff link-netnsid 0
inet 10.230.96.4/24 brd 10.230.96.255 scope global eth0
valid_lft forever preferred_lft forever
inet6 fe80::c01a:9ff:fea7:6978/64 scope link
valid_lft forever preferred_lft forever

Our user-space program is still the code from the previous article. Run the user-space program a.out in busybox1's network namespace:

[root@node1 vagrant]# ./a.out

Try the pod-to-pod access again; connectivity is restored:

[root@master1 scripts]# kubectl exec -ti busybox2-7d76b658b6-h5r2k -- ping -c2 10.230.96.4
PING 10.230.96.4 (10.230.96.4): 56 data bytes
64 bytes from 10.230.96.4: seq=0 ttl=62 time=8.954 ms
64 bytes from 10.230.96.4: seq=1 ttl=62 time=2.059 ms

--- 10.230.96.4 ping statistics ---
2 packets transmitted, 2 packets received, 0% packet loss
round-trip min/avg/max = 2.059/5.506/8.954 ms

Looking at the output of a.out, we can see that the two packets were handled by the user-space program:

[root@node1 vagrant]# ./a.out
packet: 1, origin mark: 0x00000000, set mark: 0x00000000
packet: 2, origin mark: 0x00000000, set mark: 0x00000000
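
For readers who do not have the previous article at hand, the user-space side is essentially a libnetfilter_queue consumer bound to queue 80. Below is a minimal sketch of such a program (an illustration assuming libnetfilter_queue; the author's actual program additionally reads and sets the packet mark):

/* Minimal NFQUEUE consumer sketch (assumes libnetfilter_queue is installed).
 * Build with: gcc a.c -lnetfilter_queue -o a.out */
#include <stdio.h>
#include <stdint.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter.h>
#include <libnetfilter_queue/libnetfilter_queue.h>

static int cb(struct nfq_q_handle *qh, struct nfgenmsg *nfmsg,
              struct nfq_data *nfa, void *data)
{
    struct nfqnl_msg_packet_hdr *ph = nfq_get_msg_packet_hdr(nfa);
    uint32_t id = ph ? ntohl(ph->packet_id) : 0;

    printf("packet: %u, mark: 0x%08x\n", id, nfq_get_nfmark(nfa));

    /* Accept every packet; a real program would inspect the payload here. */
    return nfq_set_verdict(qh, id, NF_ACCEPT, 0, NULL);
}

int main(void)
{
    struct nfq_handle *h = nfq_open();
    struct nfq_q_handle *qh;
    char buf[65536];
    int fd, rv;

    if (!h) {
        perror("nfq_open");
        return 1;
    }

    /* Older kernels may require (re)binding the protocol family. */
    nfq_unbind_pf(h, AF_INET);
    nfq_bind_pf(h, AF_INET);

    /* Bind to queue 80, the number used by the kernel module. */
    qh = nfq_create_queue(h, 80, &cb, NULL);
    if (!qh) {
        perror("nfq_create_queue");
        return 1;
    }
    nfq_set_mode(qh, NFQNL_COPY_PACKET, 0xffff);

    fd = nfq_fd(h);
    while ((rv = recv(fd, buf, sizeof(buf), 0)) > 0) {
        nfq_handle_packet(h, buf, rv);
    }

    nfq_destroy_queue(qh);
    nfq_close(h);
    return 0;
}

Because the program reads the queue through a netlink socket created in the namespace it runs in, it must be started inside the POD's network namespace (as done above with nsenter) to receive these packets.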

Our kernel module has a queue_bypass parameter. When it is set, the netfilter hook's return value carries the NF_VERDICT_FLAG_QUEUE_BYPASS flag. With this flag present, the kernel network stack moves on to the next hook when the target queue does not exist, instead of dropping the packet.
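
This handling lives in the generic queueing path of the netfilter core. The following is a simplified paraphrase (function and field names differ between kernel versions) of what happens when a hook returns an NF_QUEUE verdict:

/* Simplified paraphrase of the kernel's queueing path (illustrative only).
 * When enqueueing fails because no queue instance exists (-ESRCH), the
 * QUEUE_BYPASS flag in the verdict decides between "continue" and "drop". */
case NF_QUEUE:
    err = nf_queue(skb, elem, state, verdict >> NF_VERDICT_QBITS);
    if (err < 0) {
        if (err == -ESRCH &&
            (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
            goto next_hook;   /* bypass: behave like NF_ACCEPT */
        kfree_skb(skb);       /* otherwise the packet is dropped */
    }
    break;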

We set the queue_bypass parameter to 1:

echo 1 > /sys/module/nsqueue/parameters/queue_bypass

Without running the user-space program, we try the POD-to-POD access again, and it succeeds:

[root@master1 scripts]# kubectl exec -ti busybox2-7d76b658b6-h5r2k -- ping -c2 10.230.96.4
PING 10.230.96.4 (10.230.96.4): 56 data bytes
64 bytes from 10.230.96.4: seq=0 ttl=62 time=5.606 ms
64 bytes from 10.230.96.4: seq=1 ttl=62 time=3.708 ms

--- 10.230.96.4 ping statistics ---
2 packets transmitted, 2 packets received, 0% packet loss
round-trip min/avg/max = 3.708/4.657/5.606 ms