NFQUEUE and MARK Mechanism Conflict in a Kubernetes Environment

After our traffic-inspection module was installed on a Kubernetes node, all Pods on the node lost network connectivity. Analysis showed that the cause was a conflict between the NFQUEUE mechanism used by the traffic-inspection module and the iptables mark mechanism used by kube-proxy.

In the Linux kernel, a network packet is represented by the sk_buff structure (a packet is commonly abbreviated as SKB). mark is a field of the sk_buff structure (include/linux/skbuff.h):

struct sk_buff {
    ...
    union {
        __u32 mark;
        __u32 reserved_tailroom;
    };
    ...
};

mark is not part of any network protocol and never appears in any protocol header; it is a tagging mechanism the Linux networking subsystem uses to pass state information around inside the host. Network applications can use the field however they need to carry their own state.

The mark mechanism is mainly used within the netfilter framework, so it is also called nfmark. In addition, the kernel's conntrack module has a mark mechanism of its own, usually called ctmark.
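
As a rough illustration of the difference (a minimal, untested sketch; it assumes a kernel built with CONFIG_NF_CONNTRACK_MARK, and the print_marks helper is made up for this example), nfmark lives in the sk_buff itself while ctmark lives in the conntrack entry, and the two are independent unless something like the CONNMARK target copies one into the other:

#include <linux/skbuff.h>
#include <net/netfilter/nf_conntrack.h>

/* Hypothetical helper: print the packet's nfmark (skb->mark) and, if the
 * packet has a conntrack entry, the connection's ctmark (ct->mark).
 * ct->mark only exists when CONFIG_NF_CONNTRACK_MARK is enabled. */
static void print_marks(const struct sk_buff *skb)
{
    enum ip_conntrack_info ctinfo;
    struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

    pr_info("nfmark: 0x%08x\n", skb->mark);
    if (ct)
        pr_info("ctmark: 0x%08x\n", ct->mark);
}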

An earlier article, <<基于IPTABLES MARK机制实现策略路由>>, introduced the iptables MARK module, which can modify and match a packet's mark value.

The NFQUEUE mechanism can hand packets from the kernel to user space through an NFQUEUE queue, where security inspection is performed, and then send the verdict back to the kernel. The earlier article <<NFQUEUE和libnetfilter_queue实例分析>> covered basic usage of the libnetfilter_queue library. Our traffic-inspection program uses nfq_set_verdict2 from libnetfilter_queue to set the packet's mark value while returning the verdict, so that extra information can be passed to the kernel module. The function prototype is:

int nfq_set_verdict2(struct nfq_q_handle *qh,
                     uint32_t id,
                     uint32_t verdict,
                     uint32_t mark,
                     uint32_t data_len,
                     const unsigned char *buf);
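
For example, the verdict callback ends up calling it roughly like this (an illustrative call only; the 0x8000 value is hard-coded here for the discussion that follows):

/* Accept the packet and ask the kernel to set its nfmark to 0x8000. */
nfq_set_verdict2(qh, id, NF_ACCEPT, 0x8000, 0, NULL);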

This causes the mark field of the packet's sk_buff to be set. The kube-proxy implementation also relies on the iptables mark mechanism and adds iptables rules such as the following on the host:

-A KUBE-MARK-DROP -j MARK --set-xmark 0x8000/0x8000
...
-A KUBE-FIREWALL -m comment --comment "kubernetes firewall for dropping marked packets" -m mark --mark 0x8000/0x8000 -j DROP

kube-proxy marks illegitimate packets with 0x8000/0x8000 and then drops them in the KUBE-FIREWALL chain.

If the mark value set by our traffic-inspection program happens to set the 0x8000 bit that kube-proxy relies on, the packets are dropped.
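
In terms of bit operations, the mark match 0x8000/0x8000 boils down to something like the following (a simplified sketch of the check only; drop_packet() is a hypothetical stand-in for the DROP target, not a real kernel function):

/* Simplified view of "-m mark --mark 0x8000/0x8000": mask the packet mark
 * with 0x8000 and compare against 0x8000, i.e. drop whenever that bit is set,
 * no matter what the other bits are. */
if ((skb->mark & 0x8000) == 0x8000)
    drop_packet(skb);   /* hypothetical stand-in for the DROP target */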

Let's verify this with simplified programs.

First, write a kernel module that sends packets to user space through an NFQUEUE queue:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/tcp.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/inet.h>
#include <net/checksum.h>

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("skpid");
MODULE_ALIAS("module skpid netfilter");

static int clear_mark = 0;
MODULE_PARM_DESC(clear_mark, "clear sk_buff mark on/off switch");
module_param(clear_mark, int, 0600);

/* Hook prototype for the older (3.x-era) netfilter API used by the CentOS test kernel. */
static unsigned int nf_hook_in1(unsigned int hooknum,
                                struct sk_buff *sk,
                                const struct net_device *in,
                                const struct net_device *out,
                                int (*okfn)(struct sk_buff *))
{
    struct tcphdr *tcph = NULL;
    struct iphdr *iph = ip_hdr(sk);
    u8 proto = iph->protocol;

    if (proto != IPPROTO_TCP) {
        return NF_ACCEPT;
    }

    tcph = (struct tcphdr *) skb_transport_header(sk);

    /* Send packets destined for local TCP port 80 to user space via queue 80. */
    if (htons(tcph->dest) == 80) {
        printk(KERN_INFO "[1]: %d->%d mark: 0x%08x queued in [80]\n",
               htons(tcph->source), htons(tcph->dest), sk->mark);
        return NF_QUEUE_NR(80);
    }

    return NF_ACCEPT;
}

static unsigned int nf_hook_in2(unsigned int hooknum,
                                struct sk_buff *sk,
                                const struct net_device *in,
                                const struct net_device *out,
                                int (*okfn)(struct sk_buff *))
{
    struct tcphdr *tcph = NULL;
    struct iphdr *iph = ip_hdr(sk);
    u8 proto = iph->protocol;

    if (proto != IPPROTO_TCP) {
        return NF_ACCEPT;
    }

    tcph = (struct tcphdr *) skb_transport_header(sk);

    if (htons(tcph->dest) == 80) {
        /* Runs after the user-space verdict: show the mark that user space set. */
        printk(KERN_INFO "[2]: %d->%d mark: 0x%08x\n",
               htons(tcph->source), htons(tcph->dest), sk->mark);

        /*************************************************
         * clear the mark value
         *************************************************/
        if (clear_mark) {
            printk(KERN_INFO "[2]: %d->%d mark cleared\n",
                   htons(tcph->source), htons(tcph->dest));
            sk->mark = 0;
        }

        return NF_ACCEPT;
    }

    return NF_ACCEPT;
}

static struct nf_hook_ops nfhooks[] = {
    {
        .hook = nf_hook_in1,
        .owner = THIS_MODULE,
        .pf = NFPROTO_IPV4,
        .hooknum = NF_INET_LOCAL_IN,
        .priority = NF_IP_PRI_FIRST,
    },
    {
        .hook = nf_hook_in2,
        .owner = THIS_MODULE,
        .pf = NFPROTO_IPV4,
        .hooknum = NF_INET_LOCAL_IN,
        .priority = NF_IP_PRI_FIRST + 1,
    },
};


int __init skpid_init(void)
{
    nf_register_hooks(nfhooks, ARRAY_SIZE(nfhooks));

    printk("skpid module init\n");

    return 0;
}

void __exit skpid_exit(void)
{
    nf_unregister_hooks(nfhooks, ARRAY_SIZE(nfhooks));

    printk("skpid module exit\n");

    return;
}

module_init(skpid_init);
module_exit(skpid_exit);

The Makefile is as follows:

obj-m += skpid.o

all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules

clean:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean

We register two functions at the LOCAL_IN hook point of the netfilter framework. The first has priority NF_IP_PRI_FIRST and the second NF_IP_PRI_FIRST + 1; netfilter invokes hook functions in ascending priority order. The priorities at which the iptables tables hook in are listed below (include/uapi/linux/netfilter_ipv4.h):

enum nf_ip_hook_priorities {
    NF_IP_PRI_FIRST = INT_MIN,
    NF_IP_PRI_CONNTRACK_DEFRAG = -400,
    NF_IP_PRI_RAW = -300,
    NF_IP_PRI_SELINUX_FIRST = -225,
    NF_IP_PRI_CONNTRACK = -200,
    NF_IP_PRI_MANGLE = -150,
    NF_IP_PRI_NAT_DST = -100,
    NF_IP_PRI_FILTER = 0,
    NF_IP_PRI_SECURITY = 50,
    NF_IP_PRI_NAT_SRC = 100,
    NF_IP_PRI_SELINUX_LAST = 225,
    NF_IP_PRI_CONNTRACK_HELPER = 300,
    NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,
    NF_IP_PRI_LAST = INT_MAX,
};

Our two functions therefore run before iptables. nf_hook_in1 sends packets destined for local TCP port 80 to user space through queue 80. nf_hook_in2 runs after user space returns its verdict and, for this experiment, prints the mark value that user space set.

Next comes the user-space program, slightly modified from the code in the earlier article <<NFQUEUE和libnetfilter_queue实例分析>>. t.c is as follows:

#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/types.h>
#include <linux/netfilter.h>
#include <libnetfilter_queue/libnetfilter_queue.h>

uint32_t mark;

/* Callback for each queued packet: print the original mark, then
 * return NF_ACCEPT while asking the kernel to set the new mark. */
static int cb(struct nfq_q_handle *qh, struct nfgenmsg *nfmsg,
              struct nfq_data *nfa, void *data)
{
    u_int32_t id = 0;
    struct nfqnl_msg_packet_hdr *ph;
    uint32_t m = 0;

    ph = nfq_get_msg_packet_hdr(nfa);
    if (ph) {
        id = ntohl(ph->packet_id);
    }

    m = nfq_get_nfmark(nfa);

    printf("packet: %u, origin mark: 0x%08x, set mark: 0x%08x\n", id, m, mark);

    return nfq_set_verdict2(qh, id, NF_ACCEPT, mark, 0, NULL);
}

int main(int argc, char **argv)
{
    struct nfq_handle *h;
    struct nfq_q_handle *qh;
    int fd;
    int rv;
    char buf[4096];

    /* "block" argument: set the 0x8000 bit that KUBE-FIREWALL drops on. */
    if (argc > 1 && strcmp(argv[1], "block") == 0) {
        mark = 0x8000;
    } else {
        mark = 0x0;
    }

    assert((h = nfq_open()) != NULL);
    assert(nfq_unbind_pf(h, AF_INET) == 0);
    assert(nfq_bind_pf(h, AF_INET) == 0);

    assert((qh = nfq_create_queue(h, 80, &cb, NULL)) != NULL);
    assert(nfq_set_mode(qh, NFQNL_COPY_PACKET, 0xffff) == 0);

    fd = nfq_fd(h);

    while ((rv = recv(fd, buf, sizeof(buf), 0)) && rv >= 0) {
        nfq_handle_packet(h, buf, rv);
    }

    nfq_destroy_queue(qh);

    nfq_close(h);
    return 0;
}

The program reads packets from NFQUEUE queue 80 and returns a verdict. When started with the block argument it sets the mark to 0x8000, otherwise it sets it to 0x0.

Compile the user-space program:

gcc t.c -lnetfilter_queue

With the program built, prepare the test environment. First, flush the INPUT chain:

iptables -F INPUT

Then manually recreate the KUBE-FIREWALL rules that kube-proxy needs:

iptables -t filter -N KUBE-FIREWALL
iptables -A KUBE-FIREWALL -m comment --comment "kubernetes firewall for dropping marked packets" -m mark --mark 0x8000/0x8000 -j DROP
iptables -A INPUT -p tcp --dport 80 -j KUBE-FIREWALL

Packets destined for local TCP port 80 whose mark matches 0x8000/0x8000 will be dropped.

With these added, the iptables rules look like this:

[root@centos3 vagrant]# iptables -nL
Chain INPUT (policy ACCEPT)
target prot opt source destination
KUBE-FIREWALL tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:80

Chain FORWARD (policy ACCEPT)
target prot opt source destination
REJECT all -- 0.0.0.0/0 0.0.0.0/0 reject-with icmp-host-prohibited

Chain OUTPUT (policy ACCEPT)
target prot opt source destination

Chain KUBE-FIREWALL (1 references)
target prot opt source destination
DROP all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes firewall for dropping marked packets */ mark match 0x8000/0x8000

The test machine runs nginx on port 80, and at this point port 80 is reachable as expected:

[root@centos3 vagrant]# curl -Iv http://127.0.0.1
* About to connect() to 127.0.0.1 port 80 (#0)
* Trying 127.0.0.1...
* Connected to 127.0.0.1 (127.0.0.1) port 80 (#0)
> HEAD / HTTP/1.1
> User-Agent: curl/7.29.0
> Host: 127.0.0.1
> Accept: */*
>
< HTTP/1.1 200 OK
HTTP/1.1 200 OK
< Server: nginx/1.19.8
Server: nginx/1.19.8
< Date: Sun, 05 Sep 2021 12:43:15 GMT
Date: Sun, 05 Sep 2021 12:43:15 GMT
< Content-Type: text/html
Content-Type: text/html
< Content-Length: 612
Content-Length: 612
< Last-Modified: Mon, 26 Apr 2021 12:02:59 GMT
Last-Modified: Mon, 26 Apr 2021 12:02:59 GMT
< Connection: keep-alive
Connection: keep-alive
< ETag: "6086abf3-264"
ETag: "6086abf3-264"
< Accept-Ranges: bytes
Accept-Ranges: bytes

<
* Connection #0 to host 127.0.0.1 left intact

Load the kernel module:

insmod ./skpid.ko

Now accessing port 80 again fails to connect:

[root@centos3 vagrant]# curl -Iv http://127.0.0.1
* About to connect() to 127.0.0.1 port 80 (#0)
* Trying 127.0.0.1...

Looking at the /var/log/messages file, we can see that after the packet comes back from the user-space program, its mark has been changed to 0x8000:

Sep  5 12:49:58 centos3 kernel: [1]: 40240->80 mark: 0x00000000 queued in [80]
Sep 5 12:49:58 centos3 kernel: [2]: 40240->80 mark: 0x00008000
Sep 5 12:49:59 centos3 kernel: [1]: 40240->80 mark: 0x00000000 queued in [80]
Sep 5 12:49:59 centos3 kernel: [2]: 40240->80 mark: 0x00008000
...

Meanwhile the user-space program prints:

[root@centos3 c]# ./a.out block
packet: 1, origin mark: 0x00000000, set mark: 0x00008000
packet: 2, origin mark: 0x00000000, set mark: 0x00008000
...

Restart the user-space program, this time without setting the mark to 0x8000:

./a.out

Accessing port 80 again now succeeds:

[root@centos3 vagrant]# curl -Iv http://127.0.0.1
* About to connect() to 127.0.0.1 port 80 (#0)
* Trying 127.0.0.1...
* Connected to 127.0.0.1 (127.0.0.1) port 80 (#0)
> HEAD / HTTP/1.1
> User-Agent: curl/7.29.0
> Host: 127.0.0.1
> Accept: */*
>
< HTTP/1.1 200 OK
HTTP/1.1 200 OK
< Server: nginx/1.19.8
Server: nginx/1.19.8
< Date: Sun, 05 Sep 2021 12:57:25 GMT
Date: Sun, 05 Sep 2021 12:57:25 GMT
< Content-Type: text/html
Content-Type: text/html
< Content-Length: 612
Content-Length: 612
< Last-Modified: Mon, 26 Apr 2021 12:02:59 GMT
Last-Modified: Mon, 26 Apr 2021 12:02:59 GMT
< Connection: keep-alive
Connection: keep-alive
< ETag: "6086abf3-264"
ETag: "6086abf3-264"
< Accept-Ranges: bytes
Accept-Ranges: bytes

<
* Connection #0 to host 127.0.0.1 left intact

/var/log/messages shows that the mark set by the user-space program is 0x0:

Sep  5 12:57:25 centos3 kernel: [1]: 40244->80 mark: 0x00000000 queued in [80]
Sep 5 12:57:25 centos3 kernel: [2]: 40244->80 mark: 0x00000000
Sep 5 12:57:25 centos3 kernel: [1]: 40244->80 mark: 0x00000000 queued in [80]
Sep 5 12:57:25 centos3 kernel: [2]: 40244->80 mark: 0x00000000
...

As mentioned earlier, our two hook functions run before iptables. Therefore, as long as our second hook function zeroes the sk_buff's mark, there is no longer any conflict with the iptables rules.
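
Zeroing the whole mark is a blunt instrument, since it also wipes any mark set by other legitimate users on the host. A softer variant (a sketch only, assuming our user-space program never sets anything but the 0x8000 bit) would clear just that bit in nf_hook_in2:

/* Clear only the bit our program sets, leaving other users' mark bits intact.
 * Assumes the user-space program only ever sets 0x8000. */
if (clear_mark)
    sk->mark &= ~0x8000U;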

Let's try the mark-clearing approach. Run the user-space program with the block argument again:

./a.out block

Accessing port 80 again times out once more.

Now set the kernel module parameter clear_mark to 1, which makes nf_hook_in2 clear the mark value when it runs:

echo 1 > /sys/module/skpid/parameters/clear_mark

Accessing port 80 again, the connection now succeeds:

[root@centos3 vagrant]# curl -Iv http://127.0.0.1
* About to connect() to 127.0.0.1 port 80 (#0)
* Trying 127.0.0.1...
* Connected to 127.0.0.1 (127.0.0.1) port 80 (#0)
> HEAD / HTTP/1.1
> User-Agent: curl/7.29.0
> Host: 127.0.0.1
> Accept: */*
>
< HTTP/1.1 200 OK
HTTP/1.1 200 OK
< Server: nginx/1.19.8
Server: nginx/1.19.8
< Date: Sun, 05 Sep 2021 13:07:11 GMT
Date: Sun, 05 Sep 2021 13:07:11 GMT
< Content-Type: text/html
Content-Type: text/html
< Content-Length: 612
Content-Length: 612
< Last-Modified: Mon, 26 Apr 2021 12:02:59 GMT
Last-Modified: Mon, 26 Apr 2021 12:02:59 GMT
< Connection: keep-alive
Connection: keep-alive
< ETag: "6086abf3-264"
ETag: "6086abf3-264"
< Accept-Ranges: bytes
Accept-Ranges: bytes

<
* Connection #0 to host 127.0.0.1 left intact

/var/log/messages shows that the packet's mark is cleared:

Sep  5 13:07:11 centos3 kernel: [1]: 40248->80 mark: 0x00000000 queued in [80]
Sep 5 13:07:11 centos3 kernel: [2]: 40248->80 mark: 0x00008000
Sep 5 13:07:11 centos3 kernel: [2]: 40248->80 mark cleared
Sep 5 13:07:11 centos3 kernel: [1]: 40248->80 mark: 0x00000000 queued in [80]
Sep 5 13:07:11 centos3 kernel: [2]: 40248->80 mark: 0x00008000
Sep 5 13:07:11 centos3 kernel: [2]: 40248->80 mark cleared
...

If the goal is only to verify that the mark set from user space via NFQUEUE conflicts with kube-proxy's mark, there is no need to write a kernel module; the iptables NFQUEUE target can deliver the packets directly. Just keep in mind how an iptables chain matches rules:

  • Rules are checked in order; once a packet matches a rule, processing stops and later rules are not evaluated (the LOG target being the exception)
  • If no rule matches, the chain's default policy applies

Therefore the NFQUEUE rule cannot simply be added to the INPUT chain of the filter table: only one of the two rules would then be matched and executed. Adding it to the INPUT chain of the mangle table instead lets both rules take effect:

iptables -t mangle -A INPUT -p tcp --dport 80 -j NFQUEUE --queue-num 80

This article briefly showed that a mark set by an NFQUEUE user-space program can conflict with kube-proxy's own mark mechanism. A follow-up article will look in detail at how the NFQUEUE implementation actually modifies the packet's mark.

References: