bookworm-smart-assistant/skills/kubernetes-specialist/references/troubleshooting.md

11 KiB

Kubernetes Troubleshooting

Essential kubectl Commands

Pod Inspection

# Get pods with details
kubectl get pods -n production -o wide
kubectl get pods --all-namespaces
kubectl get pods --field-selector status.phase=Running
kubectl get pods --selector app=web-app

# Describe pod (shows events)
kubectl describe pod web-app-7d5c8b9f4-xk2pm -n production

# Get pod logs
kubectl logs web-app-7d5c8b9f4-xk2pm -n production
kubectl logs web-app-7d5c8b9f4-xk2pm -n production --previous  # Previous container
kubectl logs web-app-7d5c8b9f4-xk2pm -n production -c init-container
kubectl logs -f web-app-7d5c8b9f4-xk2pm -n production  # Follow logs
kubectl logs --tail=100 web-app-7d5c8b9f4-xk2pm -n production
kubectl logs --since=1h web-app-7d5c8b9f4-xk2pm -n production

# Get all pod logs from deployment
kubectl logs deployment/web-app -n production --all-containers=true

# Execute commands in pod
kubectl exec -it web-app-7d5c8b9f4-xk2pm -n production -- /bin/sh
kubectl exec web-app-7d5c8b9f4-xk2pm -n production -- env
kubectl exec web-app-7d5c8b9f4-xk2pm -n production -- cat /etc/config/app.yaml

# Copy files to/from pod
kubectl cp web-app-7d5c8b9f4-xk2pm:/app/logs/app.log ./app.log -n production
kubectl cp ./config.yaml web-app-7d5c8b9f4-xk2pm:/tmp/config.yaml -n production

# Port forward
kubectl port-forward web-app-7d5c8b9f4-xk2pm 8080:8080 -n production
kubectl port-forward service/web-app 8080:80 -n production

Deployment Debugging

# Check deployment status
kubectl get deployment web-app -n production
kubectl describe deployment web-app -n production
kubectl rollout status deployment/web-app -n production
kubectl rollout history deployment/web-app -n production

# Check replica sets
kubectl get rs -n production
kubectl describe rs web-app-7d5c8b9f4 -n production

# Scale deployment
kubectl scale deployment web-app --replicas=5 -n production

# Rollback deployment
kubectl rollout undo deployment/web-app -n production
kubectl rollout undo deployment/web-app --to-revision=2 -n production

# Restart deployment (recreate pods)
kubectl rollout restart deployment/web-app -n production

Service and Network Debugging

# Get services
kubectl get svc -n production
kubectl describe svc web-app -n production

# Get endpoints
kubectl get endpoints web-app -n production
kubectl describe endpoints web-app -n production

# Get ingress
kubectl get ingress -n production
kubectl describe ingress web-app -n production

# Get network policies
kubectl get networkpolicy -n production
kubectl describe networkpolicy frontend-to-backend -n production

Resource and Configuration

# Get ConfigMaps and Secrets
kubectl get configmap -n production
kubectl describe configmap app-config -n production
kubectl get configmap app-config -n production -o yaml

kubectl get secret -n production
kubectl describe secret app-secrets -n production
kubectl get secret app-secrets -n production -o jsonpath='{.data.password}' | base64 -d

# Get PVCs and PVs
kubectl get pvc -n production
kubectl describe pvc database-pvc -n production
kubectl get pv

# Get events (sorted by timestamp)
kubectl get events -n production --sort-by='.lastTimestamp'
kubectl get events -n production --field-selector involvedObject.name=web-app-7d5c8b9f4-xk2pm

Debug Pod

Ephemeral Debug Container

# Attach debug container to running pod
kubectl debug -it web-app-7d5c8b9f4-xk2pm -n production \
  --image=busybox:latest \
  --target=web-app

# Create copy of pod with debug tools
kubectl debug web-app-7d5c8b9f4-xk2pm -n production \
  -it \
  --image=ubuntu:latest \
  --share-processes \
  --copy-to=web-app-debug

# Debug with different image
kubectl debug web-app-7d5c8b9f4-xk2pm -n production \
  -it \
  --image=nicolaka/netshoot:latest \
  --target=web-app

Debug on Node

# Create privileged pod on specific node
kubectl debug node/node-01 -it --image=ubuntu:latest

# Access node filesystem
kubectl debug node/node-01 -it --image=ubuntu:latest -- chroot /host

Common Issues and Solutions

Issue 1: Pod in Pending State

# Check pod status and events
kubectl describe pod web-app-7d5c8b9f4-xk2pm -n production

# Common causes:
# 1. Insufficient resources
kubectl top nodes
kubectl describe nodes

# 2. PVC not bound
kubectl get pvc -n production
kubectl describe pvc database-pvc -n production

# 3. ImagePullBackOff
kubectl describe pod web-app-7d5c8b9f4-xk2pm -n production | grep -A 10 Events

# 4. Node selector/affinity issues
kubectl get pod web-app-7d5c8b9f4-xk2pm -n production -o yaml | grep -A 5 nodeSelector

Issue 2: CrashLoopBackOff

# Check logs from crashed container
kubectl logs web-app-7d5c8b9f4-xk2pm -n production --previous

# Check if liveness probe is failing
kubectl describe pod web-app-7d5c8b9f4-xk2pm -n production | grep -A 10 "Liveness"

# Debug with different command
kubectl run debug-pod --image=myapp:latest -it --rm --restart=Never -- /bin/sh

# Check resource limits
kubectl describe pod web-app-7d5c8b9f4-xk2pm -n production | grep -A 10 "Limits"

Issue 3: ImagePullBackOff

# Check image pull secret
kubectl get secret registry-credentials -n production -o yaml

# Test image pull manually
kubectl run test-pull --image=myregistry.io/myapp:v1.2.0 \
  --image-pull-policy=Always \
  --restart=Never \
  -n production

# Create/update image pull secret
kubectl create secret docker-registry registry-credentials \
  --docker-server=myregistry.io \
  --docker-username=myuser \
  --docker-password=mypassword \
  --docker-email=user@example.com \
  -n production

Issue 4: Service Not Accessible

# Check service endpoints
kubectl get endpoints web-app -n production
kubectl describe endpoints web-app -n production

# Verify pod labels match service selector
kubectl get pod web-app-7d5c8b9f4-xk2pm -n production --show-labels
kubectl get service web-app -n production -o yaml | grep -A 3 selector

# Test service connectivity from debug pod
kubectl run debug --image=nicolaka/netshoot:latest -it --rm -n production -- bash
# Inside pod:
curl http://web-app.production.svc.cluster.local
nslookup web-app.production.svc.cluster.local
telnet web-app.production.svc.cluster.local 80

Issue 5: DNS Resolution Issues

# Check CoreDNS pods
kubectl get pods -n kube-system -l k8s-app=kube-dns
kubectl logs -n kube-system -l k8s-app=kube-dns

# Test DNS resolution
kubectl run dnsutils --image=tutum/dnsutils -it --rm -- bash
# Inside pod:
nslookup kubernetes.default
nslookup web-app.production.svc.cluster.local
dig web-app.production.svc.cluster.local

# Check DNS config in pod
kubectl exec web-app-7d5c8b9f4-xk2pm -n production -- cat /etc/resolv.conf

Issue 6: NetworkPolicy Blocking Traffic

# List network policies
kubectl get networkpolicy -n production
kubectl describe networkpolicy default-deny-all -n production

# Test connectivity
kubectl run test-connectivity --image=nicolaka/netshoot:latest -it --rm -n production -- bash
# Inside pod:
curl -v http://web-app:80
nc -zv web-app 80

# Temporarily allow all traffic (testing only)
kubectl delete networkpolicy --all -n production

Issue 7: High Resource Usage

# Check resource usage
kubectl top nodes
kubectl top pods -n production
kubectl top pod web-app-7d5c8b9f4-xk2pm -n production --containers

# Check resource requests and limits
kubectl describe pod web-app-7d5c8b9f4-xk2pm -n production | grep -A 10 "Limits"

# Get pods sorted by CPU/memory usage
kubectl top pods -n production --sort-by=cpu
kubectl top pods -n production --sort-by=memory

# Check node capacity
kubectl describe node node-01 | grep -A 10 "Allocated resources"

Issue 8: PersistentVolumeClaim Issues

# Check PVC status
kubectl get pvc -n production
kubectl describe pvc database-pvc -n production

# Check PV status
kubectl get pv
kubectl describe pv pvc-abc123

# Check storage class
kubectl get storageclass
kubectl describe storageclass fast-ssd

# Events related to PVC
kubectl get events -n production --field-selector involvedObject.name=database-pvc

Advanced Debugging

API Server Debugging

# Enable verbose output
kubectl get pods -n production -v=9

# Check API server logs (on master node)
journalctl -u kube-apiserver -f

# Check cluster info
kubectl cluster-info
kubectl cluster-info dump > cluster-dump.txt

RBAC Debugging

# Check if ServiceAccount can perform action
kubectl auth can-i get pods --as=system:serviceaccount:production:web-app-sa -n production

# List permissions for ServiceAccount
kubectl describe sa web-app-sa -n production
kubectl describe role web-app-role -n production
kubectl describe rolebinding web-app-rolebinding -n production

# Check all permissions
kubectl auth can-i --list --as=system:serviceaccount:production:web-app-sa -n production

Performance Debugging

# Get resource metrics
kubectl get --raw /apis/metrics.k8s.io/v1beta1/nodes
kubectl get --raw /apis/metrics.k8s.io/v1beta1/pods

# Check pod overhead
kubectl get pod web-app-7d5c8b9f4-xk2pm -n production -o json | jq '.spec.overhead'

# Check priority classes
kubectl get priorityclasses
kubectl describe priorityclass high-priority

Diagnostic Tools

Network Tools Container

apiVersion: v1
kind: Pod
metadata:
  name: netshoot
  namespace: production
spec:
  containers:
  - name: netshoot
    image: nicolaka/netshoot:latest
    command: ["/bin/sleep", "3600"]
  restartPolicy: Never

Database Client Container

apiVersion: v1
kind: Pod
metadata:
  name: postgres-client
  namespace: production
spec:
  containers:
  - name: postgres
    image: postgres:15-alpine
    command: ["/bin/sleep", "3600"]
    env:
    - name: PGHOST
      value: postgres-service
    - name: PGUSER
      value: myapp
    - name: PGPASSWORD
      valueFrom:
        secretKeyRef:
          name: postgres-secrets
          key: password
  restartPolicy: Never

Quick Reference

Pod States

  • Pending: Waiting to be scheduled
  • ContainerCreating: Pulling image / creating container
  • Running: Pod is running
  • Succeeded: All containers exited successfully
  • Failed: At least one container failed
  • CrashLoopBackOff: Container keeps crashing
  • ImagePullBackOff: Cannot pull image
  • ErrImagePull: Image pull error
  • Unknown: Cannot get pod status

Common Exit Codes

  • 0: Success
  • 1: General error
  • 137: SIGKILL (OOMKilled - out of memory)
  • 139: SIGSEGV (segmentation fault)
  • 143: SIGTERM (graceful termination)

Best Practices

  1. Logs: Always check logs first with kubectl logs
  2. Events: Use kubectl describe to see events
  3. Labels: Use consistent labels for easier debugging
  4. Resources: Set appropriate requests and limits
  5. Health Checks: Implement proper liveness and readiness probes
  6. Monitoring: Set up comprehensive monitoring and alerting
  7. Debug Tools: Keep debug containers ready
  8. Documentation: Document common issues and solutions