A production Kubernetes cluster is experiencing issues: pods are not starting, services are unreachable, and nodes are reporting as NotReady. You need to systematically troubleshoot and resolve these issues.
Nodes are reporting NotReady status. kubectl get nodes shows:
NAME                        STATUS     ROLES    AGE   VERSION
ip-10-0-1-10.ec2.internal   Ready      <none>   30d   v1.28.0
ip-10-0-2-20.ec2.internal   NotReady   <none>   30d   v1.28.0
ip-10-0-3-30.ec2.internal   NotReady   <none>   30d   v1.28.0
Pods are stuck in Pending state or crashing with CrashLoopBackOff. Work through the cluster systematically, starting with the nodes:
# Get node status
kubectl get nodes
# Describe problematic nodes
kubectl describe node ip-10-0-2-20.ec2.internal
# Check node conditions
kubectl get nodes -o json | jq '.items[] | {name: .metadata.name, conditions: .status.conditions}'
What to Look For: the node conditions in the describe output (Ready, MemoryPressure, DiskPressure, PIDPressure, NetworkUnavailable) and any recent warning events.
Common Issues: kubelet not running, resource exhaustion, container runtime failures, or a broken CNI plugin. A quick filter for unhealthy nodes is sketched below.
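As that quick filter, a minimal sketch (assuming jq is available, as in the conditions check above):
# List nodes whose Ready condition is anything other than True
kubectl get nodes -o json | jq -r '.items[]
  | select(.status.conditions[] | select(.type=="Ready") | .status != "True")
  | .metadata.name'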
# Check node resource usage
kubectl top nodes
# Check node capacity and allocatable
kubectl describe node ip-10-0-2-20.ec2.internal | grep -A 5 "Allocated resources"
# Check for resource pressure
kubectl get nodes -o custom-columns=NAME:.metadata.name,\
MEMORY-PRESSURE:.status.conditions[?(@.type=='MemoryPressure')].status,\
DISK-PRESSURE:.status.conditions[?(@.type=='DiskPressure')].status
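If a node shows pressure, consider cordoning it and shifting workloads away before working on it; a minimal sketch using the example node name:
# Mark the node unschedulable so no new pods land on it
kubectl cordon ip-10-0-2-20.ec2.internal
# Evict existing pods (DaemonSet pods stay; emptyDir data is discarded)
kubectl drain ip-10-0-2-20.ec2.internal --ignore-daemonsets --delete-emptydir-data
# Once the node is healthy again, allow scheduling
kubectl uncordon ip-10-0-2-20.ec2.internal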
Actions:
# SSH to node (if possible)
ssh ec2-user@ip-10-0-2-20.ec2.internal
# Check kubelet status
sudo systemctl status kubelet
# Check kubelet logs
sudo journalctl -u kubelet -n 100 --no-pager
# Check kubelet configuration
cat /var/lib/kubelet/config.yaml
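Expired client certificates are a frequent reason a kubelet drops its node to NotReady. A sketch, assuming a kubeadm-style layout where the rotated certificate lives under /var/lib/kubelet/pki:
# Print the expiry of the kubelet's current client certificate
sudo openssl x509 -noout -enddate \
  -in /var/lib/kubelet/pki/kubelet-client-current.pem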
Common kubelet Issues: the service stopped or crash-looping (restart it with sudo systemctl start kubelet), expired certificates, or a kubelet that cannot reach the API server.
Next, verify the container runtime:
# On node, check container runtime
sudo systemctl status containerd
# or
sudo systemctl status docker
# Check runtime logs
sudo journalctl -u containerd -n 100
# Test container runtime
sudo crictl images
sudo crictl ps -a
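If crictl cannot connect at all, make sure crictl and the kubelet agree on the runtime socket; a sketch, assuming containerd's default socket path:
# Confirm the runtime answers on the expected socket
sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock info
# crictl reads its default endpoint from /etc/crictl.yaml, if present
cat /etc/crictl.yaml 2>/dev/null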
Common Issues: the runtime service stopped or crash-looping, a full disk, or the kubelet pointed at the wrong runtime socket.
With the nodes covered, turn to pods that fail to schedule:
# Check pending pods
kubectl get pods --all-namespaces --field-selector=status.phase=Pending
# Describe pending pod
kubectl describe pod <pod-name> -n <namespace>
# Check events
kubectl get events --all-namespaces --sort-by='.lastTimestamp' | tail -20
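To cut the noise, events can be restricted to warnings; a sketch:
# Show only Warning events, most recent last
kubectl get events -A --field-selector type=Warning --sort-by='.lastTimestamp' | tail -20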
Common Scheduling Issues: insufficient CPU or memory on every node, nodeSelector or affinity rules that match no node, taints without matching tolerations, or an unbound PersistentVolumeClaim.
If services are unreachable, check DNS next:
# Check CoreDNS pods
kubectl get pods -n kube-system -l k8s-app=kube-dns
# Check CoreDNS logs
kubectl logs -n kube-system -l k8s-app=kube-dns
# Test DNS from pod
kubectl run -it --rm debug --image=busybox --restart=Never -- nslookup kubernetes.default
# Check service endpoints
kubectl get endpoints -A
# Check if endpoints are empty
kubectl get svc <service-name> -o jsonpath='{.spec.selector}'
kubectl get pods -l <selector> --show-labels
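To compare the two mechanically, this sketch (assuming jq, and a hypothetical service named web) turns the service selector into a label selector and lists matching pods:
# Build "key=value,key=value" from the service's selector
sel=$(kubectl get svc web -o json \
  | jq -r '.spec.selector | to_entries | map("\(.key)=\(.value)") | join(",")')
# An empty pod list here explains empty endpoints
kubectl get pods -l "$sel" --show-labels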
Common Network Issues: CoreDNS down or crash-looping, empty endpoints caused by a selector/label mismatch, a failed CNI plugin on some nodes, or a NetworkPolicy blocking traffic.
Then verify the control plane:
# Check control plane component status
# (note: componentstatuses is deprecated in recent Kubernetes versions)
kubectl get componentstatuses
# or
kubectl get cs
# Check API server logs (if on master)
kubectl logs -n kube-system kube-apiserver-<node-name>
# Test API server connectivity
kubectl cluster-info
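Since componentstatuses is deprecated, the API server's own health endpoints give a more detailed picture; a sketch:
# Per-check readiness detail straight from the API server
kubectl get --raw='/readyz?verbose'
# Liveness checks
kubectl get --raw='/livez?verbose'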
Common Control Plane Issues: an API server that is down or overloaded, etcd unhealthy or out of disk space, or expired control plane certificates.
Finally, check storage:
# Check PVCs
kubectl get pvc -A
# Check PVs
kubectl get pv
# Check storage classes
kubectl get storageclass
# Describe pending PVC
kubectl describe pvc <pvc-name> -n <namespace>
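kubectl's field selectors don't generally support PVC phase, so a plain text filter is a practical sketch for finding stuck claims:
# List claims that are not Bound (STATUS is the third column with -A)
kubectl get pvc -A --no-headers | awk '$3 != "Bound"'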
Common Storage Issues: a PVC stuck Pending because no StorageClass or matching PV exists, a failing CSI provisioner, or per-node volume attach limits.
The following scenarios walk through four common failures end to end.
Symptoms: the node reports NotReady, and kubectl describe node shows DiskPressure: True.
Resolution:
# SSH to node
ssh ec2-user@ip-10-0-2-20.ec2.internal
# Check disk usage
df -h
# Clean up Docker/containerd images
sudo docker system prune -a --volumes
# or for containerd
sudo crictl rmi --prune
# Clean up old logs
sudo journalctl --vacuum-time=7d
# If still full, increase disk size (AWS)
# 1. Create snapshot
# 2. Resize EBS volume
# 3. Extend filesystem
sudo growpart /dev/nvme0n1 1
sudo resize2fs /dev/nvme0n1p1
# Restart kubelet
sudo systemctl restart kubelet
# Verify node status
kubectl get nodes
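To keep the disk from filling up again, the kubelet's image garbage collection thresholds can be lowered; a sketch against the config file seen earlier (the defaults are 85 and 80 percent):
# See whether GC thresholds are already set
sudo grep -i imagegc /var/lib/kubelet/config.yaml
# Example values to set in /var/lib/kubelet/config.yaml:
#   imageGCHighThresholdPercent: 75   # start pruning images at 75% disk usage
#   imageGCLowThresholdPercent: 60    # prune down to 60%
sudo systemctl restart kubelet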
Symptoms: pods stay in Pending, and events show messages like "0/3 nodes are available".
Resolution:
# Check why pods can't be scheduled
kubectl describe pod <pod-name> | grep -A 10 "Events:"
# Common reasons:
# 1. Resource constraints
kubectl describe node | grep -A 5 "Allocated resources"
# 2. Node selectors
kubectl get pod <pod-name> -o jsonpath='{.spec.nodeSelector}'
# 3. Taints
kubectl describe node | grep Taint
# Solutions:
# Option 1: Add more nodes
# Option 2: Remove node selector or add label
kubectl label node <node-name> <key>=<value>
# Option 3: Remove taint (if appropriate)
kubectl taint nodes <node-name> <key>:<effect>-
# Option 4: Add toleration to pod (see the sketch below)
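For option 4, a toleration can be patched onto the workload; a minimal sketch, assuming a hypothetical deployment named web and a NoSchedule taint with key dedicated:
# Let the pods tolerate the taint so the scheduler can place them
kubectl patch deployment web --type merge -p \
  '{"spec":{"template":{"spec":{"tolerations":[{"key":"dedicated","operator":"Exists","effect":"NoSchedule"}]}}}}'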
Symptoms: the service is unreachable, and kubectl get endpoints shows no endpoints.
Resolution:
# Check service selector
kubectl get svc <service-name> -o jsonpath='{.spec.selector}'
# Check if pods match selector
kubectl get pods -l app=web --show-labels
# Common issue: Selector doesn't match pod labels
# Fix: Update service or pod labels
# Update service selector
kubectl patch svc <service-name> -p '{"spec":{"selector":{"app":"web"}}}'
# Or update pod labels
kubectl label pods <pod-name> app=web
# Verify endpoints
kubectl get endpoints <service-name>
Symptoms: in-cluster DNS is broken; nslookup fails from pods.
Resolution:
# Check CoreDNS pods
kubectl get pods -n kube-system -l k8s-app=kube-dns
# If not running, check why
kubectl describe pod -n kube-system -l k8s-app=kube-dns
# Check CoreDNS logs
kubectl logs -n kube-system -l k8s-app=kube-dns
# Check CoreDNS config
kubectl get configmap coredns -n kube-system -o yaml
# Restart CoreDNS if needed
kubectl delete pod -n kube-system -l k8s-app=kube-dns
# Verify DNS
kubectl run -it --rm debug --image=busybox --restart=Never -- nslookup kubernetes.default
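busybox's nslookup is unreliable in some image versions; the dnsutils test image from the Kubernetes DNS debugging docs is a sturdier sketch:
# dig against cluster DNS using the upstream dnsutils test image
kubectl run -it --rm dnstest --restart=Never \
  --image=registry.k8s.io/e2e-test-images/jessie-dnsutils:1.3 -- \
  dig kubernetes.default.svc.cluster.local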
Prevention:
# Set resource requests and limits on every workload
resources:
  requests:
    memory: "256Mi"
    cpu: "250m"
  limits:
    memory: "512Mi"
    cpu: "500m"
# Enable cluster autoscaler
# Automatically adds nodes when needed
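If the autoscaler should be running, its logs explain scale-up decisions; a sketch, assuming the app=cluster-autoscaler label and deployment name used by the upstream manifests:
# Is the autoscaler deployed, and what has it been deciding?
kubectl -n kube-system get deploy cluster-autoscaler
kubectl -n kube-system logs -l app=cluster-autoscaler --tail=50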
# Protect availability during node drains with a PodDisruptionBudget
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: web-pdb
spec:
  minAvailable: 2
  selector:
    matchLabels:
      app: web
Quick reference:
Node status: kubectl get nodes
Resource usage: kubectl top nodes
All pods: kubectl get pods -A
Pod details: kubectl describe pod
Service endpoints: kubectl get endpoints
DNS check: kubectl run debug -- nslookup
Use kubectl describe for detailed information on any resource.