0%

k8s【coredns解析问题】

问题

[root@master busybox]# kubectl get pod -nkube-system -owide
NAME                             READY   STATUS    RESTARTS   AGE     IP               NODE     NOMINATED NODE   READINESS GATES
coredns-5c98db65d4-8zjps         1/1     Running   1          2d23h   10.244.0.13      master   <none>           <none>
coredns-5c98db65d4-d2kth         1/1     Running   1          2d23h   10.244.0.14      master   <none>           <none>
[root@master busybox]# kubectl get pod -owide
NAME                      READY   STATUS    RESTARTS   AGE    IP            NODE     NOMINATED NODE   READINESS GATES
curl-6bf6db5c4f-pjld9     1/1     Running   1          3d     10.244.1.2    node2    <none>           <none>
gateway-99b655cc6-np685   1/1     Running   0          44s    10.244.0.54   master   <none>           <none>
test-post-start1          1/1     Running   0          115s   10.244.1.6    node2    <none>           <none>
test-post-start2          1/1     Running   0          115s   10.244.0.52   master   <none>           <none>
test-post-start3          1/1     Running   0          115s   10.244.0.53   master   <none>           <none>

如上所示,部署在master节点上的pod无法解析gateway.default.svc.cluster.local域名,但是部署在node2上的pod却可以解析,例如curl-6bf6db5c4f-pjld9、test-post-start1通过nslookup都可以解析.

# 报错
/ # nslookup gateway
nslookup: can't resolve '(null)': Name does not resolve

nslookup: can't resolve 'gateway': Try again

/ # nslookup gateway.default.svc.cluster.local
Server:    10.244.0.10
Address 1: 10.244.0.10

nslookup: can't resolve 'gateway.default.svc.cluster.local'

分析

进入master节点pod,直接通过coredns pod ip解析测试

kubectl exec -it test-post-start2 sh
/ # nslookup gateway.default.svc.cluster.local 10.244.0.13
Server:    10.244.0.13
Address 1: 10.244.0.13 10-244-0-13.kube-dns.kube-system.svc.cluster.local

Name:      gateway.default.svc.cluster.local
Address 1: 10.244.106.29 gateway.default.svc.cluster.local

发现直接通过coredns pod ip解析可以成功,证明coredns服务本身没有问题.

查看dns clusterIP.

[root@master ~]# kubectl get svc -nkube-system
NAME            TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)                  AGE
kube-dns        ClusterIP   10.244.0.10    <none>        53/UDP,53/TCP,9153/TCP   21m
# 通过clusterIP解析域名失败
nslookup gateway.default.svc.cluster.local 10.244.0.10

通过以上测试证明问题出现在coredns service上.

解决

导出现有kube-dns service配置

kubectl get svc -nkube-system kube-dns -oyaml > kube-dns-svc.yaml

修改kube-dns-svc.yaml,删除clusterIP、resourceVersion、uid、status等字段,让集群重新分配clusterIP,修改后内容如下.

apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/port: "9153"
    prometheus.io/scrape: "true"
  labels:
    k8s-app: kube-dns
    kubernetes.io/cluster-service: "true"
    kubernetes.io/name: KubeDNS
  name: kube-dns
  namespace: kube-system
spec:
  ports:
  - name: dns
    port: 53
    protocol: UDP
    targetPort: 53
  - name: dns-tcp
    port: 53
    protocol: TCP
    targetPort: 53
  - name: metrics
    port: 9153
    protocol: TCP
    targetPort: 9153
  selector:
    k8s-app: kube-dns
  sessionAffinity: None
  type: ClusterIP
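# 应用修改后的配置(已存在service的clusterIP不可修改,如apply后clusterIP未变化,需先执行 kubectl delete svc -nkube-system kube-dns 再apply)
kubectl apply -f kube-dns-svc.yaml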

查看最新的coredns clusterIP,当前为10.244.47.231.

[root@master ~]# kubectl get svc -nkube-system
NAME            TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)                  AGE
kube-dns        ClusterIP   10.244.47.231    <none>        53/UDP,53/TCP,9153/TCP   21m

进入之前无法解析的pod中测试,证明通过新的clusterIP解析没有问题.

nslookup gateway.default.svc.cluster.local 10.244.47.231

修改kubelet的clusterDNS配置(对应命令行参数--cluster-dns),这样新创建的pod的/etc/resolv.conf中nameserver为新的coredns clusterIP.

# 修改kubelet配置
vim  /var/lib/kubelet/config.yaml

# 找到clusterDNS
clusterDNS:
- 10.244.47.231

# 重启kubelet生效,注意k8s中所有节点都需要修改重启
systemctl restart kubelet.service

最后测试,查看新建pod的/etc/resolv.conf,nameserver已更新为新的clusterIP,解析没有问题.

/ # cat /etc/resolv.conf 
nameserver 10.244.47.231
search default.svc.cluster.local svc.cluster.local cluster.local
options ndots:5