问题
[root@master busybox]# kubectl get pod -nkube-system -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
coredns-5c98db65d4-8zjps 1/1 Running 1 2d23h 10.244.0.13 master <none> <none>
coredns-5c98db65d4-d2kth 1/1 Running 1 2d23h 10.244.0.14 master <none> <none>
[root@master busybox]# kubectl get pod -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
curl-6bf6db5c4f-pjld9 1/1 Running 1 3d 10.244.1.2 node2 <none> <none>
gateway-99b655cc6-np685 1/1 Running 0 44s 10.244.0.54 master <none> <none>
test-post-start1 1/1 Running 0 115s 10.244.1.6 node2 <none> <none>
test-post-start2 1/1 Running 0 115s 10.244.0.52 master <none> <none>
test-post-start3 1/1 Running 0 115s 10.244.0.53 master <none> <none>
如上所示,部署在master节点上的pod无法解析gateway.default.svc.cluster.local域名,但是部署在node2节点上的pod却可以解析,例如curl-6bf6db5c4f-pjld9、test-post-start1通过nslookup都可以解析.
# 报错
/ # nslookup gateway
nslookup: can't resolve '(null)': Name does not resolve
nslookup: can't resolve 'gateway': Try again
/ # nslookup gateway.default.svc.cluster.local
Server: 10.244.0.10
Address 1: 10.244.0.10
nslookup: can't resolve 'gateway.default.svc.cluster.local'
分析
进入master节点上的pod,直接通过coredns pod ip进行解析测试.
kubectl exec -it test-post-start2 sh
/ # nslookup gateway.default.svc.cluster.local 10.244.0.13
Server: 10.244.0.13
Address 1: 10.244.0.13 10-244-0-13.kube-dns.kube-system.svc.cluster.local
Name: gateway.default.svc.cluster.local
Address 1: 10.244.106.29 gateway.default.svc.cluster.local
发现直接通过coredns pod ip解析可以成功,证明coredns服务本身没有问题.
查看kube-dns service的clusterIP.
[root@master ~]# kubectl get svc -nkube-system
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
kube-dns ClusterIP 10.244.0.10 <none> 53/UDP,53/TCP,9153/TCP 21m
# 通过clusterIP解析域名失败
nslookup gateway.default.svc.cluster.local 10.244.0.10
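通过以上测试证明问题出现在coredns service(即clusterIP)上.值得注意的是,clusterIP 10.244.0.10与master节点上pod的IP(10.244.0.x)落在同一网段,很可能是service网段与pod网段重叠,导致master节点上的pod无法正确访问该clusterIP(此原因为推测,待确认).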
解决
导出现有kube-dns service配置.
kubectl get svc -nkube-system kube-dns -oyaml > kube-dns-svc.yaml
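修改kube-dns-svc.yaml,删除clusterIP、resourceVersion、uid、creationTimestamp等由集群生成的字段,修改后内容如下.注意spec.clusterIP是不可变字段,如需重新分配clusterIP,需要先执行kubectl delete svc -nkube-system kube-dns删除旧的service,再执行apply.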
apiVersion: v1
kind: Service
metadata:
annotations:
prometheus.io/port: "9153"
prometheus.io/scrape: "true"
labels:
k8s-app: kube-dns
kubernetes.io/cluster-service: "true"
kubernetes.io/name: KubeDNS
name: kube-dns
namespace: kube-system
spec:
ports:
- name: dns
port: 53
protocol: UDP
targetPort: 53
- name: dns-tcp
port: 53
protocol: TCP
targetPort: 53
- name: metrics
port: 9153
protocol: TCP
targetPort: 9153
selector:
k8s-app: kube-dns
sessionAffinity: None
type: ClusterIP
kubectl apply -f kube-dns-svc.yaml
查看最新的coredns clusterIP,当前为10.244.47.231.
[root@master ~]# kubectl get svc -nkube-system
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
kube-dns ClusterIP 10.244.47.231 <none> 53/UDP,53/TCP,9153/TCP 21m
进入之前无法解析的pod中测试,证明新的clusterIP没有问题.
nslookup gateway.default.svc.cluster.local 10.244.47.231
修改kubelet的clusterDNS配置(对应命令行参数--cluster-dns),这样新创建的pod的/etc/resolv.conf中nameserver为新的coredns clusterIP.
# 修改kubelet配置
vim /var/lib/kubelet/config.yaml
# 找到clusterDNS
clusterDNS:
- 10.244.47.231
# 重启kubelet生效,注意k8s中所有节点都需要修改重启
systemctl restart kubelet.service
最后测试,新的pod中/etc/resolv.conf的nameserver已是新的clusterIP,解析没有问题.
/ # cat /etc/resolv.conf
nameserver 10.244.47.231
search default.svc.cluster.local svc.cluster.local cluster.local
options ndots:5