Skip to content

Commit

Permalink
Rework IP rules/routing for Kubernetes clustering
Browse files Browse the repository at this point in the history
Add IP rules for the Kubernetes Pod and Service CIDRs to ensure
that traffic within the Kubernetes network uses the main routing
table, which contains routes installed by the CNI.

These IP rules must have a higher priority than the per-port rules
for dom0-initiated connections. Otherwise, there could be scenarios
where a packet sent to a Kubernetes service is DNAT-ed by kube-proxy
to a service backend located on another node, and this DNAT-ed flow
(dom0-initiated, with a port IP as the source address) bypasses the VXLAN
tunnel and is sent out without encapsulation, thus getting lost/dropped.

Also, we no longer need special marks for Kubernetes traffic.
The iptables rules that prevent traffic from being forwarded
between device ports no longer depend on connection marking, and
therefore we do not need connection marks for all traffic that is
allowed to be forwarded.

Signed-off-by: Milan Lenco <[email protected]>
  • Loading branch information
milan-zededa authored and eriknordmark committed Oct 18, 2024
1 parent dd27fe1 commit eed46dc
Show file tree
Hide file tree
Showing 10 changed files with 241 additions and 399 deletions.
12 changes: 7 additions & 5 deletions pkg/pillar/devicenetwork/pbr.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,17 @@ const (
// (and persisted) for every network instance.
NIBaseRTIndex = 800

// PbrLocalDestPrio : IP rule priority for packets destined to locally owned addresses
PbrLocalDestPrio = 12000
// PbrLocalOrigPrio : IP rule priority for locally generated packets
PbrLocalOrigPrio = 15000

// PbrNatOutGatewayPrio : IP rule priority for packets destined to gateway(bridge ip) coming from apps.
PbrNatOutGatewayPrio = 9999
// PbrNatOutPrio : IP rule priority for packets destined to internet coming from apps
PbrNatOutPrio = 10000
// PbrNatInPrio : IP rule priority for external packets coming in towards apps
PbrNatInPrio = 11000
// PbrLocalDestPrio : IP rule priority for packets destined to locally owned addresses
PbrLocalDestPrio = 12000
// PbrKubeNetworkPrio : IP rule priority for traffic flowing through the Kubernetes
// network.
PbrKubeNetworkPrio = 13000
// PbrLocalOrigPrio : IP rule priority for locally (dom0) generated packets
PbrLocalOrigPrio = 15000
)
148 changes: 36 additions & 112 deletions pkg/pillar/dpcreconciler/linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import (
"syscall"
"time"

"golang.org/x/sys/unix"

dg "github.com/lf-edge/eve-libs/depgraph"
"github.com/lf-edge/eve-libs/reconciler"
"github.com/lf-edge/eve/pkg/pillar/base"
Expand All @@ -40,10 +42,15 @@ import (
// | | NetworkIO | | Global | |
// | | | | | |
// | | +-----------+ +------------+ | | +-------------+ +-------------+ | |
// | | | NetIO | | NetIO | | | | ResolvConf | | LocalIPRule | | |
// | | | (external)| | (external) | ... | | | (singleton) | | (singleton) | | |
// | | | NetIO | | NetIO | | | | ResolvConf | | IPRule | | |
// | | | (external)| | (external) | ... | | | (singleton) | | (Local RT) | | |
// | | +-----------+ +------------+ | | +-------------+ +-------------+ | |
// | +--------------------------------------+ +------------------------------------+ |
// | +--------------------------------------+ | +-------------------+ | |
// | | | IPRule | ... | |
// | | | (for HV=kubevirt) | | |
// | | +-------------------+ | |
// | +------------------------------------+ |
// | |
// | |
// | +-----------------+ +------------------+ +-------------------------------------+ |
// | | PhysicalIfs | | LogicalIO (L2) | | Wireless | |
Expand All @@ -62,9 +69,9 @@ import (
// | | +-------------------------------+ | |
// | | | IPRules | | |
// | | +----------------------------------------+ | | | |
// | | | Adapters | | +---------+ +----------+ | | |
// | | | | | |SrcIPRule| |SrcIPRule | ... | | |
// | | | +---------+ +---------+ | | +---------+ +----------+ | | |
// | | | Adapters | | +-------+ +--------+ | | |
// | | | | | |IPRule | | IPRule | ... | | |
// | | | +---------+ +---------+ | | +-------+ +--------+ | | |
// | | | | Adapter | | Adapter | ... | +-------------------------------+ | |
// | | | +---------+ +---------+ | | |
// | | | +------------+ +------------+ | +-------------------------------+ | |
Expand Down Expand Up @@ -146,14 +153,11 @@ const (
intendedStateFile = "/run/nim-intended-state.dot"
)

const (
// Network bridge used by Kubernetes CNI.
// Currently, this is hardcoded for the Flannel CNI plugin.
kubeCNIBridge = "cni0"
var (
// CIDR used for IP allocation for K3s pods.
kubePodCIDR = "10.42.0.0/16"
_, kubePodCIDR, _ = net.ParseCIDR("10.42.0.0/16")
// CIDR used for IP allocation for K3s services.
kubeSvcCIDR = "10.43.0.0/16"
_, kubeSvcCIDR, _ = net.ParseCIDR("10.43.0.0/16")
)

// LinuxDpcReconciler is a DPC-reconciler for Linux network stack,
Expand Down Expand Up @@ -778,17 +782,6 @@ func (r *LinuxDpcReconciler) updateCurrentAdapterAddrs(
func (r *LinuxDpcReconciler) updateCurrentRoutes(dpc types.DevicePortConfig) (changed bool) {
sgPath := dg.NewSubGraphPath(L3SG, RoutesSG)
currentRoutes := dg.New(dg.InitArgs{Name: RoutesSG})
cniIfIndex := -1
if r.HVTypeKube {
ifIndex, found, err := r.NetworkMonitor.GetInterfaceIndex(kubeCNIBridge)
if err != nil {
r.Log.Errorf("getIntendedRoutes: failed to get ifIndex for %s: %v",
kubeCNIBridge, err)
}
if err == nil && found {
cniIfIndex = ifIndex
}
}
for _, port := range dpc.Ports {
if port.IfName == "" || port.InvalidConfig {
continue
Expand Down Expand Up @@ -823,29 +816,6 @@ func (r *LinuxDpcReconciler) updateCurrentRoutes(dpc types.DevicePortConfig) (ch
LastOperation: reconciler.OperationCreate,
})
}

if cniIfIndex != -1 {
cniRoutes, err := r.NetworkMonitor.ListRoutes(netmonitor.RouteFilters{
FilterByTable: true,
Table: table,
FilterByIf: true,
IfIndex: cniIfIndex,
})
if err != nil {
r.Log.Errorf("updateCurrentRoutes: ListRoutes failed for ifIndex %d: %v",
cniIfIndex, err)
}
for _, rt := range cniRoutes {
currentRoutes.PutItem(linux.Route{
Route: rt.Data.(netlink.Route),
UnmanagedLink: true,
}, &reconciler.ItemStateData{
State: reconciler.ItemStateCreated,
LastOperation: reconciler.OperationCreate,
})
}

}
}
prevSG := dg.GetSubGraph(r.currentState, sgPath)
if len(prevSG.DiffItems(currentRoutes)) > 0 {
Expand Down Expand Up @@ -877,7 +847,22 @@ func (r *LinuxDpcReconciler) getIntendedGlobalCfg(dpc types.DevicePortConfig) dg
}
intendedCfg := dg.New(graphArgs)
// Move IP rule that matches local destined packets below network instance rules.
intendedCfg.PutItem(linux.LocalIPRule{Priority: devicenetwork.PbrLocalDestPrio}, nil)
intendedCfg.PutItem(linux.IPRule{
Priority: devicenetwork.PbrLocalDestPrio,
Table: unix.RT_TABLE_LOCAL,
}, nil)
if r.HVTypeKube {
intendedCfg.PutItem(linux.IPRule{
Dst: kubePodCIDR,
Priority: devicenetwork.PbrKubeNetworkPrio,
Table: unix.RT_TABLE_MAIN,
}, nil)
intendedCfg.PutItem(linux.IPRule{
Dst: kubeSvcCIDR,
Priority: devicenetwork.PbrKubeNetworkPrio,
Table: unix.RT_TABLE_MAIN,
}, nil)
}
if len(dpc.Ports) == 0 {
return intendedCfg
}
Expand Down Expand Up @@ -1150,12 +1135,10 @@ func (r *LinuxDpcReconciler) getIntendedSrcIPRules(dpc types.DevicePortConfig) d
continue
}
for _, ipAddr := range ipAddrs {
intendedRules.PutItem(linux.SrcIPRule{
AdapterLL: port.Logicallabel,
AdapterIfName: port.IfName,
IPAddr: ipAddr.IP,
Priority: devicenetwork.PbrLocalOrigPrio,
Table: devicenetwork.DPCBaseRTIndex + ifIndex,
intendedRules.PutItem(linux.IPRule{
Src: netutils.HostSubnet(ipAddr.IP),
Priority: devicenetwork.PbrLocalOrigPrio,
Table: devicenetwork.DPCBaseRTIndex + ifIndex,
}, nil)
}
}
Expand All @@ -1170,26 +1153,6 @@ func (r *LinuxDpcReconciler) getIntendedRoutes(dpc types.DevicePortConfig) dg.Gr
intendedRoutes := dg.New(graphArgs)
// Routes are copied from the main table.
srcTable := syscall.RT_TABLE_MAIN
var cniRoutes []netmonitor.Route
if r.HVTypeKube {
ifIndex, found, err := r.NetworkMonitor.GetInterfaceIndex(kubeCNIBridge)
if err != nil {
r.Log.Errorf("getIntendedRoutes: failed to get ifIndex for %s: %v",
kubeCNIBridge, err)
}
if err == nil && found {
cniRoutes, err = r.NetworkMonitor.ListRoutes(netmonitor.RouteFilters{
FilterByTable: true,
Table: srcTable,
FilterByIf: true,
IfIndex: ifIndex,
})
if err != nil {
r.Log.Errorf("getIntendedRoutes: ListRoutes failed for ifIndex %d: %v",
ifIndex, err)
}
}
}
for _, port := range dpc.Ports {
if port.IfName == "" || port.InvalidConfig {
continue
Expand Down Expand Up @@ -1224,15 +1187,6 @@ func (r *LinuxDpcReconciler) getIntendedRoutes(dpc types.DevicePortConfig) dg.Gr
AdapterLL: port.Logicallabel,
}, nil)
}
for _, rt := range cniRoutes {
rtCopy := rt.Data.(netlink.Route)
rtCopy.Table = dstTable
r.prepareRouteForCopy(&rtCopy)
intendedRoutes.PutItem(linux.Route{
Route: rtCopy,
UnmanagedLink: true,
}, nil)
}
}
return intendedRoutes
}
Expand Down Expand Up @@ -1922,40 +1876,10 @@ func (r *LinuxDpcReconciler) getIntendedMarkingRules(dpc types.DevicePortConfig,
TargetOpts: []string{"--set-mark", controlProtoMark("in_dhcp")},
Description: "Mark ingress DHCP traffic",
}
// Mark all traffic from Kubernetes pods to Kubernetes services.
// Note that traffic originating from another node is already D-NATed
// and will get marked with the kube_pod mark.
markKubeSvc := iptables.Rule{
RuleLabel: "Kubernetes service mark",
MatchOpts: []string{"-i", kubeCNIBridge, "-s", kubePodCIDR, "-d", kubeSvcCIDR},
Target: "CONNMARK",
TargetOpts: []string{"--set-mark", controlProtoMark("kube_svc")},
Description: "Mark traffic from Kubernetes pods to Kubernetes services",
}
// Mark all traffic forwarded between Kubernetes pods.
markKubePod := iptables.Rule{
RuleLabel: "Kubernetes pod mark",
MatchOpts: []string{"-s", kubePodCIDR, "-d", kubePodCIDR},
Target: "CONNMARK",
TargetOpts: []string{"--set-mark", controlProtoMark("kube_pod")},
Description: "Mark all traffic directly forwarded between Kubernetes pods",
}
// Mark all DNS requests made from the Kubernetes network.
markKubeDNS := iptables.Rule{
RuleLabel: "Kubernetes DNS mark",
MatchOpts: []string{"-s", kubePodCIDR, "-p", "udp", "--dport", "domain"},
Target: "CONNMARK",
TargetOpts: []string{"--set-mark", controlProtoMark("kube_dns")},
AppliedBefore: []string{markKubeSvc.RuleLabel, markKubePod.RuleLabel},
Description: "Mark DNS requests made from the Kubernetes network",
}

protoMarkV4Rules := []iptables.Rule{
markSSHAndGuacamole, markVnc, markIcmpV4, markDhcp,
}
if r.HVTypeKube {
protoMarkV4Rules = append(protoMarkV4Rules, markKubeDNS, markKubeSvc, markKubePod)
}
protoMarkV6Rules := []iptables.Rule{
markSSHAndGuacamole, markVnc, markIcmpV6,
}
Expand Down
8 changes: 4 additions & 4 deletions pkg/pillar/dpcreconciler/linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ func TestReconcileWithEmptyArgs(test *testing.T) {
t.Expect(status.RS.ConfigError).To(BeEmpty())
t.Expect(status.DNS.Error).To(BeNil())
t.Expect(status.DNS.Servers).To(BeEmpty())
t.Expect(itemCountWithType(linux.LocalIPRuleTypename)).To(Equal(1))
t.Expect(itemCountWithType(linux.IPRuleTypename)).To(Equal(1))
t.Expect(itemCountWithType(iptables.ChainV4Typename)).To(Equal(14))
t.Expect(itemCountWithType(iptables.ChainV6Typename)).To(Equal(14))
t.Expect(itemCountWithType(iptables.RuleV4Typename)).To(Equal(22))
Expand Down Expand Up @@ -311,7 +311,7 @@ func TestSingleEthInterface(test *testing.T) {
t.Expect(status.DNS.Servers["eth0"]).To(HaveLen(1))
t.Expect(status.DNS.Servers["eth0"][0].String()).To(Equal("8.8.8.8"))
t.Expect(itemDescription(adapterAddrs)).To(Equal("Adapter mock-eth0 IP addresses: [192.168.10.5/24]"))
t.Expect(itemIsCreatedWithLabel("IP rule for mock-eth0/192.168.10.5")).To(BeTrue())
t.Expect(itemIsCreatedWithLabel("15000: from 192.168.10.5/32 to all lookup 501")).To(BeTrue())
t.Expect(itemIsCreatedWithLabel("IPv4 route table 501 dst <default> dev mock-eth0 via 192.168.10.1")).To(BeTrue())
t.Expect(itemIsCreated(resolvConf)).To(BeTrue())
t.Expect(itemDescription(resolvConf)).To(ContainSubstring("eth0: [8.8.8.8]"))
Expand Down Expand Up @@ -537,8 +537,8 @@ func TestMultipleEthsSameSubnet(test *testing.T) {
t.Expect(itemDescription(eth0AdapterAddrs)).To(Equal("Adapter mock-eth0 IP addresses: [192.168.10.5/24]"))
eth1AdapterAddrs := dg.Reference(generic.AdapterAddrs{AdapterIfName: "eth1"})
t.Expect(itemDescription(eth1AdapterAddrs)).To(Equal("Adapter mock-eth1 IP addresses: [192.168.10.6/24]"))
t.Expect(itemIsCreatedWithLabel("IP rule for mock-eth0/192.168.10.5")).To(BeTrue())
t.Expect(itemIsCreatedWithLabel("IP rule for mock-eth1/192.168.10.6")).To(BeTrue())
t.Expect(itemIsCreatedWithLabel("15000: from 192.168.10.5/32 to all lookup 501")).To(BeTrue())
t.Expect(itemIsCreatedWithLabel("15000: from 192.168.10.6/32 to all lookup 502")).To(BeTrue())
t.Expect(itemIsCreatedWithLabel("IPv4 route table 501 dst <default> dev mock-eth0 via 192.168.10.1")).To(BeTrue())
t.Expect(itemIsCreatedWithLabel("IPv4 route table 502 dst <default> dev mock-eth1 via 192.168.10.1")).To(BeTrue())
t.Expect(itemIsCreatedWithLabel("ARP entry 192.168.10.6 / 02:00:00:00:00:02 for mock-eth0")).To(BeTrue())
Expand Down
Loading

0 comments on commit eed46dc

Please sign in to comment.