// Some design notes and history: // VXLAN encapsulates L2 packets (though flannel is L3 only so don't expect to be able to send L2 packets across hosts) // The first versions of vxlan for flannel registered the flannel daemon as a handler for both "L2" and "L3" misses // - When a container sends a packet to a new IP address on the flannel network (but on a different host) this generates // an L2 miss (i.e. an ARP lookup) // - The flannel daemon knows which flannel host the packet is destined for so it can supply the VTEP MAC to use. // This is stored in the ARP table (with a timeout) to avoid constantly looking it up. // - The packet can then be encapsulated but the host needs to know where to send it. This creates another callout from // the kernel vxlan code to the flannel daemon to get the public IP that should be used for that VTEP (this gets called // an L3 miss). The L2/L3 miss hooks are registered when the vxlan device is created. At the same time a device route // is created to the whole flannel network so that non-local traffic is sent over the vxlan device. // // In this scheme the scaling of table entries (per host) is: // - 1 route (for the configured network out the vxlan device) // - One arp entry for each remote container that this host has recently contacted // - One FDB entry for each remote host // // The second version of flannel vxlan removed the need for the L3MISS callout. When a new remote host is found (either // during startup or when it's created), flannel simply adds the required entries so that no further lookup/callout is required. // // // The latest version of the vxlan backend removes the need for the L2MISS too, which means that the flannel daemon is not // listening for any netlink messages anymore. This improves reliability (no problems with timeouts if // flannel crashes or restarts) and simplifies upgrades. 
// // How it works: // Create the vxlan device but don't register for any L2MISS or L3MISS messages // Then, as each remote host is discovered (either on startup or when they are added), do the following // 1) create routing table entry for the remote subnet. It goes via the vxlan device but also specifies a next hop (of the remote flannel host). // 2) Create a static ARP entry for the remote flannel host IP address (and the VTEP MAC) // 3) Create an FDB entry with the VTEP MAC and the public IP of the remote flannel daemon. // // In this scheme the scaling of table entries is linear to the number of remote hosts - 1 route, 1 arp entry and 1 FDB entry per host // // In this newest scheme, there is also the option of skipping the use of vxlan for hosts that are on the same subnet, // this is called "directRouting"
[root@master1 ~]# kubectl exec -it busybox2-6f8fdb784d-r6ln2 -- ip route default via 10.230.41.1 dev eth0 10.230.0.0/16 via 10.230.41.1 dev eth0 10.230.41.0/24 dev eth0 scope link src 10.230.41.17
[root@node1 ~]# ip addr show dev cni0 5: cni0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue state UP group default qlen 1000 link/ether 86:99:b6:37:95:b2 brd ff:ff:ff:ff:ff:ff inet 10.230.41.1/24 brd 10.230.41.255 scope global cni0 valid_lft forever preferred_lft forever inet6 fe80::8499:b6ff:fe37:95b2/64 scope link valid_lft forever preferred_lft forever
[root@node1 ~]# ip route default via 10.0.2.2 dev eth0 10.0.2.0/24 dev eth0 proto kernel scope link src 10.0.2.15 10.230.41.0/24 dev cni0 proto kernel scope link src 10.230.41.1 10.230.93.0/24 via 10.230.93.0 dev flannel.1 onlink 10.240.0.0/24 dev eth1 proto kernel scope link src 10.240.0.101 169.254.0.0/16 dev eth0 scope link metric 1002 169.254.0.0/16 dev eth1 scope link metric 1003
[root@node2 ~]# ip addr show flannel.1 4: flannel.1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue state UNKNOWN group default link/ether 2a:02:24:58:e9:07 brd ff:ff:ff:ff:ff:ff inet 10.230.93.0/32 scope global flannel.1 valid_lft forever preferred_lft forever inet6 fe80::2802:24ff:fe58:e907/64 scope link valid_lft forever preferred_lft forever
flannel.1解包之后,根据内层目的地址:10.230.93.2查找路由转发到cni0:
1 2 3 4 5 6
[root@node2 ~]# ip route default via 10.0.2.2 dev eth0 proto dhcp metric 100 10.0.2.0/24 dev eth0 proto kernel scope link src 10.0.2.15 metric 100 10.230.41.0/24 via 10.230.41.0 dev flannel.1 onlink 10.230.93.0/24 dev cni0 proto kernel scope link src 10.230.93.1 10.240.0.0/24 dev eth1 proto kernel scope link src 10.240.0.102 metric 101
// Create a backend manager then use it to create the backend and register the network with it. bm := backend.NewManager(ctx, sm, extIface) be, err := bm.GetBackend(config.BackendType) if err != nil { log.Errorf("Error fetching backend: %s", err) cancel() wg.Wait() os.Exit(1) }
// Ensure that the device has a /32 address so that no broadcast routes are created. // This IP is just used as a source address for host to workload traffic (so // the return path for the traffic has an address on the flannel network to use as the destination) if err := dev.Configure(ip.IP4Net{IP: lease.Subnet.IP, PrefixLen: 32}); err != nil { return nil, fmt.Errorf("failed to configure interface %s: %s", dev.link.Attrs().Name, err) }
if err := WriteSubnetFile(opts.subnetFile, config.Network, opts.ipMasq, bn); err != nil { // Continue, even though it failed. log.Warningf("Failed to write subnet file: %s", err) } else { log.Infof("Wrote subnet file to %s", opts.subnetFile) }
接着,main函数中启动一个goroutine去运行bn.Run:
1 2 3 4 5 6 7
// Start "Running" the backend network. This will block until the context is done so run in another goroutine. log.Info("Running backend.") wg.Add(1) go func() { bn.Run(ctx) wg.Done() }()
// This route is used when traffic should be vxlan encapsulated vxlanRoute := netlink.Route{ LinkIndex: nw.dev.link.Attrs().Index, Scope: netlink.SCOPE_UNIVERSE, Dst: sn.ToIPNet(), Gw: sn.IP.ToIP(), } vxlanRoute.SetFlag(syscall.RTNH_F_ONLINK)
// directRouting is where the remote host is on the same subnet so vxlan isn't required. directRoute := netlink.Route{ Dst: sn.ToIPNet(), Gw: attrs.PublicIP.ToIP(), } var directRoutingOK = false if nw.dev.directRouting { routes, err := netlink.RouteGet(attrs.PublicIP.ToIP()) if err != nil { log.Errorf("Couldn't lookup route to %v: %v", attrs.PublicIP, err) continue } if len(routes) == 1 && routes[0].Gw == nil { // There is only a single route and there's no gateway (i.e. it's directly connected) directRoutingOK = true } }
switch event.Type { case subnet.EventAdded: if directRoutingOK { log.V(2).Infof("Adding direct route to subnet: %s PublicIP: %s", sn, attrs.PublicIP)
// Try to clean up the ARP entry then continue if err := nw.dev.DelARP(neighbor{IP: event.Lease.Subnet.IP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)}); err != nil { log.Error("DelARP failed: ", err) }
continue }
// Set the route - the kernel would ARP for the Gw IP address if it hadn't already been set above so make sure // this is done last. if err := netlink.RouteReplace(&vxlanRoute); err != nil { log.Errorf("failed to add vxlanRoute (%s -> %s): %v", vxlanRoute.Dst, vxlanRoute.Gw, err)
// Try to clean up both the ARP and FDB entries then continue if err := nw.dev.DelARP(neighbor{IP: event.Lease.Subnet.IP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)}); err != nil { log.Error("DelARP failed: ", err) }
continue } } case subnet.EventRemoved: if directRoutingOK { log.V(2).Infof("Removing direct route to subnet: %s PublicIP: %s", sn, attrs.PublicIP) if err := netlink.RouteDel(&directRoute); err != nil { log.Errorf("Error deleting route to %v via %v: %v", sn, attrs.PublicIP, err) } } else { log.V(2).Infof("removing subnet: %s PublicIP: %s VtepMAC: %s", sn, attrs.PublicIP, net.HardwareAddr(vxlanAttrs.VtepMAC))
// Try to remove all entries - don't bail out if one of them fails. if err := nw.dev.DelARP(neighbor{IP: sn.IP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)}); err != nil { log.Error("DelARP failed: ", err) }