diff --git a/aula-08/README.md b/aula-08/README.md index 834234d..5e29036 100644 --- a/aula-08/README.md +++ b/aula-08/README.md @@ -202,7 +202,47 @@ aula-08/ ├── install-nginx-ingress.sh # Instala NGINX Ingress com LB ├── install-metrics-server.sh # Instala Metrics Server (kubectl top, HPA) ├── nginx-ingress-values.yaml # Configuracao do NGINX Ingress -└── talos-patches/ # Patches de configuracao Talos - ├── control-plane.yaml - └── worker.yaml +├── talos-patches/ # Patches de configuracao Talos +│ ├── control-plane.yaml +│ └── worker.yaml +├── hcloud-csi-values.yaml # Configuracao do CSI Driver +└── statefulset-pdb.yaml # PDB para proteger StatefulSets ``` + +## Troubleshooting: Volume Stuck + +Se um pod ficar `Pending` aguardando volume: + +### 1. Verificar VolumeAttachment + +```bash +kubectl get volumeattachments +kubectl describe volumeattachment <nome-do-volumeattachment> +``` + +### 2. Se o node de origem nao existe mais + +```bash +# Deletar o VolumeAttachment orfao (seguro pois node nao existe) +kubectl delete volumeattachment <nome-do-volumeattachment> +``` + +### 3. Se o node existe mas pod morreu + +```bash +# Aguardar - Kubernetes vai liberar automaticamente +# Timeout padrao: 6 minutos +``` + +### 4. 
Verificar no Hetzner + +```bash +hcloud volume list +# Se o volume aparece attached a um server que nao existe mais, tentar `hcloud volume detach <volume>`; se falhar, abrir ticket na Hetzner +``` + +### Limitacoes do Block Storage + +- Volumes Hetzner sao **RWO** (ReadWriteOnce) - single-attach por design +- Podem ficar stuck por ate 6 min (timeout de force-detach do attach/detach controller do Kubernetes) +- Se node morrer abruptamente, recuperacao pode ser manual (deletar VolumeAttachment) diff --git a/aula-08/hcloud-csi-values.yaml b/aula-08/hcloud-csi-values.yaml new file mode 100644 index 0000000..0e00b31 --- /dev/null +++ b/aula-08/hcloud-csi-values.yaml @@ -0,0 +1,13 @@ +# Configuracoes para graceful handling de node failures +controller: + tolerations: + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 60 + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 60 + # Mais replicas para HA do controller + replicaCount: 2 diff --git a/aula-08/setup.sh b/aula-08/setup.sh index 676eef7..46528b3 100755 --- a/aula-08/setup.sh +++ b/aula-08/setup.sh @@ -534,6 +534,7 @@ log_info "Instalando CSI Driver via Helm..." helm upgrade --install hcloud-csi hcloud/hcloud-csi \ -n kube-system \ + -f "$SCRIPT_DIR/hcloud-csi-values.yaml" \ --wait \ --timeout 5m @@ -543,6 +544,11 @@ log_success "Hetzner CSI Driver instalado!" log_info "Verificando StorageClass..." kubectl get storageclass hcloud-volumes +# Configurar PDB para StatefulSets (protecao durante drain) +log_info "Criando PodDisruptionBudget para StatefulSets..." 
+kubectl apply -f "$SCRIPT_DIR/statefulset-pdb.yaml" +log_success "PDB criado" + echo "" ############################################################ diff --git a/aula-08/statefulset-pdb.yaml b/aula-08/statefulset-pdb.yaml new file mode 100644 index 0000000..337b7ac --- /dev/null +++ b/aula-08/statefulset-pdb.yaml @@ -0,0 +1,12 @@ +# PodDisruptionBudget protecting StatefulSet pods during node drain, so volumes do not get stuck during maintenance +# maxUnavailable: 1 limits voluntary evictions to one matching pod at a time (minAvailable: 0 would allow evicting all of them at once) +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: statefulset-pdb + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + app.kubernetes.io/component: primary