diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4de7722 --- /dev/null +++ b/.gitignore @@ -0,0 +1,30 @@ +# OpenTofu / Terraform +**/.terraform/ +**/.tofu/ +**/*.tfstate +**/*.tfstate.* +**/tfplan +**/tfplan.out +**/.terraform.lock.hcl + +# Credenciais e configs sensíveis +**/terraform.tfvars +**/kubeconfig +**/kubeconfig-* +**/talosconfig +**/*.pem +**/*.key + +# OS +.DS_Store +Thumbs.db + +# Editor +*.swp +*.swo +*~ +.idea/ +.vscode/ + +# Node (aula-01) +node_modules/ diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..6e18c2d --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,114 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +This is a workshop repository for teaching Docker and Kubernetes concepts, specifically focusing on container health checks and liveness probes. It contains a deliberately "buggy" Node.js app that hangs after a configurable number of requests to demonstrate how container orchestration handles unhealthy containers. + +## Repository Structure + +- **aula-01/**: Docker Compose lesson - basic container deployment with restart policies +- **aula-02/**: Kubernetes lesson - deployment with liveness probes and ConfigMaps +- **aula-03/**: Kubernetes lesson - high availability with replicas and readiness probes +- **aula-04/**: Kubernetes lesson - NGINX Ingress with Keep Request (Lua) for zero-downtime +- **aula-05/**: Kubernetes lesson - KEDA + Victoria Metrics for metrics-based auto-scaling +- **aula-06/**: Kubernetes lesson - n8n deployment via Helm with Queue Mode (workers, webhooks, PostgreSQL, Redis) +- **aula-07/**: Talos Linux - creating custom Talos image for Hetzner Cloud +- **aula-08/**: OpenTofu - provisioning HA Talos Kubernetes cluster on Hetzner Cloud + +## Running the Examples + +### Aula 01 (Docker Compose) +```bash +cd aula-01 +docker-compose up +``` +The app runs on port 3000. 
After MAX_REQUESTS (default 3), the app stops responding. + +### Aula 02 (Kubernetes) +```bash +cd aula-02 +kubectl apply -f configmap.yaml +kubectl apply -f deployment.yaml +kubectl apply -f service.yaml +``` +Access via NodePort 30080. The liveness probe at `/health` will detect when the app hangs and restart the container. + +### Aula 03 (Kubernetes - High Availability) +```bash +cd aula-03 +kubectl apply -f configmap.yaml +kubectl apply -f deployment.yaml +kubectl apply -f service.yaml +``` +Builds on Aula 02 with multiple replicas and a readiness probe. When one pod hangs, the others continue serving requests. The readiness probe removes unhealthy pods from the Service immediately, while the liveness probe restarts them. + +### Aula 04 (Kubernetes - NGINX Ingress with Keep Request) +Requires NGINX Ingress Controller with Lua support. + +```bash +cd aula-04 +kubectl apply -f configmap.yaml +kubectl apply -f deployment.yaml +kubectl apply -f service.yaml +kubectl apply -f ingress-nginx.yaml +``` +Access via NGINX Ingress. The Keep Request pattern uses Lua to hold requests when backends are unavailable, waiting up to 99s for a pod to become ready instead of returning 503 immediately. This eliminates user-visible failures during pod restarts. + +### Aula 05 (Kubernetes - KEDA Auto-scaling) +```bash +cd aula-05 +./setup.sh +``` +Installs Victoria Metrics (metrics collection), KEDA (event-driven autoscaling), and NGINX Ingress. The ScaledObject monitors metrics like unavailable pods and restart counts, automatically scaling the deployment from 5 to 30 replicas based on demand. + +### Aula 06 (Kubernetes - n8n via Helm) +```bash +cd aula-06 +./setup.sh +``` +Deploys n8n workflow automation platform via Helm chart with Queue Mode architecture: main node, workers (2-5 replicas with HPA), webhooks (1-3 replicas with HPA), PostgreSQL, and Redis. Access via http://n8n.localhost (requires NGINX Ingress). 
+ +### Aula 07 (Talos Linux - Custom Image) +Follow the instructions in `aula-07/README.md` to create a custom Talos Linux image on Hetzner Cloud using Talos Factory. This is a prerequisite for Aula 08. + +### Aula 08 (OpenTofu - Talos Cluster on Hetzner Cloud) +```bash +cd aula-08 +./setup.sh +``` +Provisions a full HA Kubernetes cluster on Hetzner Cloud using OpenTofu: +- 3x Control Plane nodes (CAX11 ARM64) +- 1x Worker node (CAX11 ARM64) +- Private network, Floating IP, Firewall +- Cluster Autoscaler support (1-5 workers) +- Estimated cost: ~€18/month (base), up to ~€33/month with max autoscaling + +Prerequisites: +- OpenTofu (`brew install opentofu`) +- talosctl (`brew install siderolabs/tap/talosctl`) +- kubectl +- Hetzner Cloud API token +- Talos image ID from Aula 07 + +Optional - Enable cluster autoscaling: +```bash +./install-autoscaler.sh +``` +This installs the Kubernetes Cluster Autoscaler configured for Hetzner Cloud, automatically scaling workers from 1 to 5 based on pending pods. + +To destroy the infrastructure: `./cleanup.sh` + +## App Behavior + +The Node.js app (`app.js`) is intentionally designed to: +1. Accept requests normally until `MAX_REQUESTS` is reached +2. Stop responding (hang) after the limit, simulating a crashed but running process +3. The `/health` endpoint also stops responding when the app is "stuck" + +This behavior demonstrates why process-level monitoring (restart: always) is insufficient and why application-level health checks (liveness probes) are necessary. 
+ +## Environment Variables + +- `MAX_REQUESTS`: Number of requests before the app hangs (default: 3) diff --git a/aula-04/ingress.yaml- b/aula-04/ingress.yaml- deleted file mode 100644 index 645a230..0000000 --- a/aula-04/ingress.yaml- +++ /dev/null @@ -1,26 +0,0 @@ -# Middleware Retry - tenta outros pods quando um falha -apiVersion: traefik.io/v1alpha1 -kind: Middleware -metadata: - name: retry-middleware -spec: - retry: - attempts: 5 # 5 tentativas - initialInterval: 500ms # 500ms entre ciclos ---- -# IngressRoute -apiVersion: traefik.io/v1alpha1 -kind: IngressRoute -metadata: - name: node-bugado -spec: - entryPoints: - - web - routes: - - match: PathPrefix(`/`) - kind: Rule - middlewares: - - name: retry-middleware - services: - - name: node-bugado - port: 3000 diff --git a/aula-06/setup.sh b/aula-06/setup.sh index 7f82ee6..3ea0bf2 100755 --- a/aula-06/setup.sh +++ b/aula-06/setup.sh @@ -204,7 +204,7 @@ echo " # Ver todos os pods" echo " kubectl get pods -n n8n" echo "" echo " # Ver logs do n8n" -echo " kubectl logs -f -l app.kubernetes.io/component=main -n n8n" +echo " kubectl logs -f -n n8n deployment/n8n" echo "" echo " # Ver HPA (autoscaler)" echo " kubectl get hpa -n n8n" @@ -218,6 +218,12 @@ echo "" echo " # Fazer upgrade do helm chart" echo " helm upgrade --reuse-values --values --custom-values.yaml n8n community-charts/n8n --namespace n8n" echo "" +echo " # Verificar historico de releases" +echo " helm history n8n -n n8n" +echo "" +echo " # Fazer rollback do historico de releases" +echo " helm rollback n8n " +echo "" echo "==============================================" echo "" diff --git a/aula-07/README.md b/aula-07/README.md new file mode 100644 index 0000000..ad2428f --- /dev/null +++ b/aula-07/README.md @@ -0,0 +1,63 @@ +--- +criado: 2025-12-27T01:10:54-03:00 +atualizado: 2025-12-27T02:25:34-03:00 +--- + + +A Hetzner Cloud não oferece suporte ao upload de imagens personalizadas. 
Somente via suporte [issue 3599](https://github.com/siderolabs/talos/issues/3599#issuecomment-841172018)  + +Workaround +1. Execute uma instância no modo de recuperação e substitua o sistema operacional pela imagem do Talos. +2. 🚧 De acordo com a documentação oficial é possível usar [o Hashicorp Packer](https://www.packer.io/docs/builders/hetzner-cloud) para preparar uma imagem. Mas a documentação oficial foi removida dos builders. E nos meus testes deu kernel panic.... + + +Passo 1 -> https://factory.talos.dev/ + +- [ ] siderolabs/amd-ucode / siderolabs/intel-ucode + - Spectre / Meltdown (V1, V2, V4) + - Predição de desvios + - Leitura de memória privilegiada a partir de userland + - Zenbleed (CVE-2023-20593) + - CPUs AMD Zen 2 + - Vazamento de registros via execução especulativa + - Impacta **VMs e containers** + - Speculative Return Stack Overflow (SRSO) + - CPUs AMD modernas + - Jailbreak +- [ ] siderolabs/qemu-guest-agent (Hetzner usa QEMU / KVM) +- [ ] siderolabs/stargz-snapshotter (https://github.com/containerd/stargz-snapshotter) +- [ ] siderolabs/util-linux-tools (lsblk, mount, findmnt) +- [ ] siderolabs/binfmt-misc (Se for usar imagem multi-arch) +- siderolabs/tailscale OU cloudflared -> https://spot.rackspace.com/ +- zfs -> Se for Baremetal (~50% mais rápido que ext4) + +bootloader: dual-boot + +https://factory.talos.dev/?arch=amd64&board=undefined&bootloader=dual-boot&cmdline-set=true&extensions=-&extensions=siderolabs%2Famd-ucode&extensions=siderolabs%2Fbinfmt-misc&extensions=siderolabs%2Fintel-ucode&extensions=siderolabs%2Fqemu-guest-agent&extensions=siderolabs%2Fstargz-snapshotter&extensions=siderolabs%2Futil-linux-tools&platform=hcloud&secureboot=undefined&target=cloud&version=1.12.0 + + + +```bash +# Testar se vc entrou em Rescue mode + df + + ### Resultado será tipo: + # Filesystem 1K-blocks Used Available Use% Mounted on + # udev 987432 0 987432 0% /dev + # 213.133.99.101:/nfs 308577696 247015616 45817536 85% /root/.oldroot/nfs + # overlay 
995672 8340 987332 1% / + # tmpfs 995672 0 995672 0% /dev/shm + # tmpfs 398272 572 397700 1% /run + # tmpfs 5120 0 5120 0% /run/lock + # tmpfs 199132 0 199132 0% /run/user/0 + + # Baixar a imagem do Talos + cd /tmp + wget -O /tmp/talos.raw.xz https://factory.talos.dev/image/c4f17c623d4ac547a243489f1b3285afd64a76b491b1c5c24ef6363587cef55f/v1.12.0/hcloud-amd64.raw.xz + + # Escrever o sistema (Vai demorar uns 4 a 5 minutos) + xz -d -c /tmp/talos.raw.xz | dd of=/dev/sda && sync + + # Desligue a instancia antes do snapshot + shutdown -h now + ``` diff --git a/aula-08/cleanup.sh b/aula-08/cleanup.sh new file mode 100755 index 0000000..977e3bd --- /dev/null +++ b/aula-08/cleanup.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +############################################################ +# Aula 08 - Cleanup +# Destrói a infraestrutura provisionada +############################################################ + +set -e + +# Cores para output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } +log_success() { echo -e "${GREEN}[OK]${NC} $1"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } + +echo "" +echo "============================================" +echo " Cleanup - Destruir Infraestrutura" +echo "============================================" +echo "" + +# Verificar se tofu está instalado +if ! command -v tofu &> /dev/null; then + log_error "OpenTofu não encontrado!" + exit 1 +fi + +# Verificar se há state +if [ ! -f "terraform.tfstate" ] && [ ! -d ".terraform" ]; then + log_warn "Nenhuma infraestrutura encontrada para destruir." 
+ exit 0 +fi + +# Verificar workers do autoscaler (criados fora do OpenTofu) +if [ -f "kubeconfig" ]; then + export KUBECONFIG="$SCRIPT_DIR/kubeconfig" + + AUTOSCALER_WORKERS=$(kubectl get nodes -l node.kubernetes.io/instance-type=cax11 \ + --no-headers 2>/dev/null | wc -l | tr -d ' ' || echo "0") + + if [ "$AUTOSCALER_WORKERS" -gt "1" ]; then + log_warn "Detectados $AUTOSCALER_WORKERS workers (incluindo os do autoscaler)" + log_warn "Workers criados pelo autoscaler serão removidos via API Hetzner" + echo "" + fi +fi + +log_warn "ATENÇÃO: Esta operação irá DESTRUIR todos os recursos!" +echo "" +echo "Recursos que serão removidos:" +echo " - 3x Control Plane nodes" +echo " - Workers (incluindo os criados pelo autoscaler)" +echo " - Rede privada" +echo " - Floating IP" +echo " - Firewall" +echo " - Placement Group" +echo "" + +read -p "Tem certeza que deseja continuar? (digite 'sim' para confirmar): " confirm + +if [ "$confirm" != "sim" ]; then + log_info "Operação cancelada" + exit 0 +fi + +echo "" + +# Remover workers do autoscaler primeiro (se existirem) +if [ -f "terraform.tfvars" ]; then + HCLOUD_TOKEN=$(grep 'hcloud_token' terraform.tfvars | cut -d'"' -f2) + CLUSTER_NAME=$(tofu output -raw cluster_name 2>/dev/null || echo "") + + if [ -n "$HCLOUD_TOKEN" ] && [ -n "$CLUSTER_NAME" ]; then + log_info "Verificando workers do autoscaler..." + + # Listar servers com label do cluster que NÃO são gerenciados pelo tofu + AUTOSCALER_SERVERS=$(HCLOUD_TOKEN="$HCLOUD_TOKEN" hcloud server list \ + -l cluster="$CLUSTER_NAME" \ + -o noheader -o columns=id,name 2>/dev/null | \ + grep -E "worker-pool" || true) + + if [ -n "$AUTOSCALER_SERVERS" ]; then + log_warn "Removendo workers criados pelo autoscaler..." + echo "$AUTOSCALER_SERVERS" | while read -r server_id server_name; do + log_info " Removendo $server_name (ID: $server_id)..." 
+ HCLOUD_TOKEN="$HCLOUD_TOKEN" hcloud server delete "$server_id" --quiet 2>/dev/null || true + done + log_success "Workers do autoscaler removidos" + fi + fi +fi + +echo "" +log_info "Destruindo infraestrutura via OpenTofu..." +echo "" + +tofu destroy -auto-approve + +echo "" +log_success "Infraestrutura destruída!" +echo "" + +# Limpar arquivos locais (manter .terraform para re-deploy rápido) +log_info "Limpando arquivos gerados..." + +rm -f kubeconfig talosconfig tfplan terraform.tfstate terraform.tfstate.backup + +log_success "Arquivos removidos" +echo "" + +# Perguntar sobre terraform.tfvars +if [ -f "terraform.tfvars" ]; then + read -p "Remover terraform.tfvars também? (s/N): " remove_tfvars + if [[ "$remove_tfvars" =~ ^[Ss]$ ]]; then + rm -f terraform.tfvars + log_success "terraform.tfvars removido" + else + log_info "terraform.tfvars mantido (útil para re-deploy)" + fi +fi + +echo "" +log_success "Cleanup concluído!" diff --git a/aula-08/cluster-autoscaler.yaml b/aula-08/cluster-autoscaler.yaml new file mode 100644 index 0000000..bb81910 --- /dev/null +++ b/aula-08/cluster-autoscaler.yaml @@ -0,0 +1,158 @@ +############################################################ +# Cluster Autoscaler para Hetzner Cloud + Talos +# Escala workers automaticamente de 1 a 5 nodes +############################################################ + +--- +apiVersion: v1 +kind: Namespace +metadata: + name: cluster-autoscaler + +# Secret is created via install-autoscaler.sh (kubectl create secret) +# to properly handle base64 encoding of cloud-init + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: cluster-autoscaler + namespace: cluster-autoscaler + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cluster-autoscaler +rules: + - apiGroups: [""] + resources: ["events", "endpoints"] + verbs: ["create", "patch"] + - apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] + - apiGroups: [""] + resources: ["pods/status"] + verbs: 
["update"] + - apiGroups: [""] + resources: ["endpoints"] + resourceNames: ["cluster-autoscaler"] + verbs: ["get", "update"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["watch", "list", "get", "update"] + - apiGroups: [""] + resources: ["namespaces", "pods", "services", "replicationcontrollers", "persistentvolumeclaims", "persistentvolumes"] + verbs: ["watch", "list", "get"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["watch", "list", "get"] + - apiGroups: ["batch", "extensions"] + resources: ["jobs"] + verbs: ["get", "list", "patch", "watch"] + - apiGroups: ["extensions"] + resources: ["replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["watch", "list"] + - apiGroups: ["apps"] + resources: ["statefulsets", "replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses", "csinodes", "csidrivers", "csistoragecapacities"] + verbs: ["watch", "list", "get"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + resourceNames: ["cluster-autoscaler"] + verbs: ["get", "update"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["create", "get", "update", "delete", "list", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cluster-autoscaler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-autoscaler +subjects: + - kind: ServiceAccount + name: cluster-autoscaler + namespace: cluster-autoscaler + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cluster-autoscaler + namespace: cluster-autoscaler + labels: + app: cluster-autoscaler +spec: + replicas: 1 + selector: + matchLabels: + app: cluster-autoscaler + template: + metadata: + labels: + app: cluster-autoscaler + spec: + serviceAccountName: 
cluster-autoscaler + # Use host network to access external APIs (Hetzner) + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + # Workaround: Talos DNS proxy doesn't forward to upstream correctly + hostAliases: + - ip: "213.239.246.73" + hostnames: + - "api.hetzner.cloud" + containers: + - name: cluster-autoscaler + image: registry.k8s.io/autoscaling/cluster-autoscaler:v1.31.0 + command: + - ./cluster-autoscaler + - --cloud-provider=hetzner + - --nodes=0:5:CAX11:nbg1:worker-pool + - --nodes=0:0:CAX11:nbg1:draining-node-pool + - --scale-down-enabled=true + - --scale-down-delay-after-add=5m + - --scale-down-unneeded-time=3m + - --scale-down-utilization-threshold=0.5 + - --skip-nodes-with-local-storage=false + - --skip-nodes-with-system-pods=false + - --balance-similar-node-groups=true + - --v=4 + env: + - name: HCLOUD_TOKEN + valueFrom: + secretKeyRef: + name: hcloud-autoscaler + key: token + - name: HCLOUD_CLOUD_INIT + valueFrom: + secretKeyRef: + name: hcloud-autoscaler + key: cloud-init + - name: HCLOUD_IMAGE + value: "${TALOS_IMAGE_ID}" + - name: HCLOUD_NETWORK + value: "${NETWORK_NAME}" + - name: HCLOUD_FIREWALL + value: "${FIREWALL_NAME}" + - name: HCLOUD_SSH_KEY + value: "${SSH_KEY_NAME}" + resources: + requests: + cpu: 100m + memory: 300Mi + limits: + cpu: 500m + memory: 500Mi diff --git a/aula-08/install-autoscaler.sh b/aula-08/install-autoscaler.sh new file mode 100755 index 0000000..33172ed --- /dev/null +++ b/aula-08/install-autoscaler.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +############################################################ +# Instala o Cluster Autoscaler no cluster Talos +# Requer: cluster provisionado via setup.sh +############################################################ + +set -e + +# Cores +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } +log_success() { echo -e 
"${GREEN}[OK]${NC} $1"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } + +echo "" +echo "============================================" +echo " Instalando Cluster Autoscaler" +echo "============================================" +echo "" + +# Verificar pré-requisitos +if [ ! -f "kubeconfig" ]; then + log_error "kubeconfig não encontrado! Execute setup.sh primeiro." + exit 1 +fi + +if [ ! -f "terraform.tfvars" ]; then + log_error "terraform.tfvars não encontrado!" + exit 1 +fi + +export KUBECONFIG="$SCRIPT_DIR/kubeconfig" + +# Verificar conexão com cluster +log_info "Verificando conexão com o cluster..." +if ! kubectl get nodes &>/dev/null; then + log_error "Não foi possível conectar ao cluster!" + exit 1 +fi +log_success "Conectado ao cluster" + +# Obter valores do OpenTofu +log_info "Obtendo configurações do OpenTofu..." + +WORKER_CONFIG_BASE64=$(tofu output -raw autoscaler_worker_config 2>/dev/null) +TALOS_IMAGE_ID=$(tofu output -raw autoscaler_image_id 2>/dev/null) +CLUSTER_NAME=$(tofu output -raw cluster_name 2>/dev/null) +NETWORK_ID=$(tofu output -raw network_id 2>/dev/null) +FIREWALL_ID=$(tofu output -raw firewall_id 2>/dev/null) +SSH_KEY_NAME=$(tofu output -raw ssh_key_name 2>/dev/null) + +# Obter token do terraform.tfvars +HCLOUD_TOKEN=$(grep 'hcloud_token' terraform.tfvars | cut -d'"' -f2) + +if [ -z "$WORKER_CONFIG_BASE64" ] || [ -z "$HCLOUD_TOKEN" ]; then + log_error "Não foi possível obter as configurações necessárias!" + exit 1 +fi + +log_success "Configurações obtidas" +echo " - Cluster: $CLUSTER_NAME" +echo " - Image ID: $TALOS_IMAGE_ID" +echo " - Network ID: $NETWORK_ID" +echo " - SSH Key: $SSH_KEY_NAME" +echo "" + +# Criar namespace com política privileged (necessário para hostNetwork) +log_info "Criando namespace cluster-autoscaler..." 
+kubectl create namespace cluster-autoscaler --dry-run=client -o yaml | kubectl apply -f - +kubectl label namespace cluster-autoscaler pod-security.kubernetes.io/enforce=privileged --overwrite + +# Criar secret com credenciais +log_info "Criando secret com credenciais..." +kubectl create secret generic hcloud-autoscaler \ + --namespace cluster-autoscaler \ + --from-literal=token="$HCLOUD_TOKEN" \ + --from-literal=cloud-init="$WORKER_CONFIG_BASE64" \ + --dry-run=client -o yaml | kubectl apply -f - + +log_success "Secret criado" + +# Aplicar RBAC e Deployment +log_info "Aplicando manifesto do cluster-autoscaler..." + +# Substituir variáveis no template e aplicar +cat cluster-autoscaler.yaml | \ + sed "s|\${TALOS_IMAGE_ID}|$TALOS_IMAGE_ID|g" | \ + sed "s|\${NETWORK_NAME}|$CLUSTER_NAME-network|g" | \ + sed "s|\${FIREWALL_NAME}|$CLUSTER_NAME-firewall|g" | \ + sed "s|\${SSH_KEY_NAME}|$SSH_KEY_NAME|g" | \ + kubectl apply -f - + +log_success "Cluster Autoscaler instalado!" + +# Aguardar pod ficar pronto +log_info "Aguardando pod do autoscaler..." +kubectl wait --for=condition=ready pod \ + -l app=cluster-autoscaler \ + -n cluster-autoscaler \ + --timeout=120s + +echo "" +log_success "Cluster Autoscaler pronto!" 
+ +echo "" +echo "============================================" +echo " Configuração do Autoscaler" +echo "============================================" +echo "" +echo " Pool: worker-pool" +echo " Tipo: CAX11 (ARM64)" +echo " Região: nbg1 (Nuremberg)" +echo " Min nodes: 1" +echo " Max nodes: 5" +echo "" +echo " Scale down após: 5 minutos" +echo " Utilização mínima: 50%" +echo "" +echo "Comandos úteis:" +echo "" +echo " # Ver logs do autoscaler" +echo " kubectl logs -n cluster-autoscaler -l app=cluster-autoscaler -f" +echo "" +echo " # Ver status dos nodes" +echo " kubectl get nodes" +echo "" +echo " # Testar scale up (criar pods pending)" +echo " kubectl create deployment test --image=nginx --replicas=10" +echo "" diff --git a/aula-08/main.tf b/aula-08/main.tf new file mode 100644 index 0000000..ec4aea7 --- /dev/null +++ b/aula-08/main.tf @@ -0,0 +1,391 @@ +############################################################ +# Hetzner Talos Kubernetes Cluster - Base Infrastructure +# Using custom Talos image created from ISO +############################################################ + +############################################################ +# PROVIDERS CONFIGURATION +############################################################ + +provider "hcloud" { + token = var.hcloud_token +} + +############################################################ +# DATA SOURCES +############################################################ + +# Use the custom Talos image created in aula-07 +data "hcloud_image" "talos" { + id = var.talos_image_id +} + +############################################################ +# RANDOM RESOURCES +############################################################ + +resource "random_string" "cluster_id" { + length = 6 + special = false + lower = true + upper = false +} + +locals { + cluster_name = "talos-${random_string.cluster_id.result}" + common_labels = { + cluster = local.cluster_name + environment = var.environment + managed_by = "terraform" + } +} + 
+############################################################ +# SSH KEY (for emergency access only) +############################################################ + +data "hcloud_ssh_keys" "all" {} + +locals { + ssh_key_normalized = trimspace(split(" ", var.ssh_public_key)[0] == "ssh-rsa" ? + join(" ", slice(split(" ", var.ssh_public_key), 0, 2)) : + var.ssh_public_key) + + ssh_key_matches = [ + for key in data.hcloud_ssh_keys.all.ssh_keys : key.id + if key.public_key == local.ssh_key_normalized || key.public_key == var.ssh_public_key + ] + + ssh_key_id = length(local.ssh_key_matches) > 0 ? local.ssh_key_matches[0] : hcloud_ssh_key.admin[0].id +} + +resource "hcloud_ssh_key" "admin" { + count = length(local.ssh_key_matches) == 0 ? 1 : 0 + name = "${local.cluster_name}-admin" + public_key = var.ssh_public_key + labels = local.common_labels +} + +############################################################ +# NETWORK CONFIGURATION +############################################################ + +resource "hcloud_network" "cluster" { + name = "${local.cluster_name}-network" + ip_range = "10.0.0.0/16" + labels = local.common_labels +} + +resource "hcloud_network_subnet" "cluster" { + type = "cloud" + network_id = hcloud_network.cluster.id + network_zone = "eu-central" + ip_range = "10.0.1.0/24" +} + +############################################################ +# FIREWALL CONFIGURATION +############################################################ + +resource "hcloud_firewall" "cluster" { + name = "${local.cluster_name}-firewall" + labels = local.common_labels + + # Talos API access + rule { + direction = "in" + protocol = "tcp" + port = "50000" + source_ips = ["0.0.0.0/0", "::/0"] + } + + # Kubernetes API + rule { + direction = "in" + protocol = "tcp" + port = "6443" + source_ips = ["0.0.0.0/0", "::/0"] + } + + # Allow HTTP/HTTPS for Ingress + rule { + direction = "in" + protocol = "tcp" + port = "80" + source_ips = ["0.0.0.0/0", "::/0"] + } + + rule { + direction = 
"in" + protocol = "tcp" + port = "443" + source_ips = ["0.0.0.0/0", "::/0"] + } + + # Allow NodePort range (for services) + rule { + direction = "in" + protocol = "tcp" + port = "30000-32767" + source_ips = ["0.0.0.0/0", "::/0"] + } + + # Allow all outbound traffic + rule { + direction = "out" + protocol = "tcp" + port = "any" + destination_ips = ["0.0.0.0/0", "::/0"] + } + + rule { + direction = "out" + protocol = "udp" + port = "any" + destination_ips = ["0.0.0.0/0", "::/0"] + } + + rule { + direction = "out" + protocol = "icmp" + destination_ips = ["0.0.0.0/0", "::/0"] + } +} + +############################################################ +# PLACEMENT GROUP (keep nodes close for low latency) +############################################################ + +resource "hcloud_placement_group" "cluster" { + name = "${local.cluster_name}-pg" + type = "spread" + labels = local.common_labels +} + +############################################################ +# CONTROL PLANE NODES (HA with 3 CAX11 nodes) +############################################################ + +resource "hcloud_server" "control_plane" { + count = 3 + name = "${local.cluster_name}-cp-${count.index}" + server_type = "cax11" + image = data.hcloud_image.talos.id + location = "nbg1" # CAX11 only available in Nuremberg + ssh_keys = [local.ssh_key_id] + + firewall_ids = [hcloud_firewall.cluster.id] + placement_group_id = hcloud_placement_group.cluster.id + + labels = merge(local.common_labels, { + role = "control-plane" + node = "cp-${count.index}" + arch = "arm64" + }) + + public_net { + ipv4_enabled = true + ipv6_enabled = true + } + + lifecycle { + ignore_changes = [ssh_keys] + } +} + +resource "hcloud_server_network" "control_plane" { + count = 3 + server_id = hcloud_server.control_plane[count.index].id + network_id = hcloud_network.cluster.id + ip = "10.0.1.${10 + count.index}" +} + +# Floating IP for stable control plane access +resource "hcloud_floating_ip" "control_plane" { + type = "ipv4" + name 
= "${local.cluster_name}-cp-ip" + home_location = "nbg1" + labels = local.common_labels +} + +resource "hcloud_floating_ip_assignment" "control_plane" { + floating_ip_id = hcloud_floating_ip.control_plane.id + server_id = hcloud_server.control_plane[0].id +} + +############################################################ +# WORKER NODE (Single CAX11) +############################################################ + +resource "hcloud_server" "worker" { + count = 1 + name = "${local.cluster_name}-worker-${count.index}" + server_type = "cax11" + image = data.hcloud_image.talos.id + location = "nbg1" + ssh_keys = [local.ssh_key_id] + + firewall_ids = [hcloud_firewall.cluster.id] + placement_group_id = hcloud_placement_group.cluster.id + + labels = merge(local.common_labels, { + role = "worker" + node = "worker-${count.index}" + arch = "arm64" + }) + + public_net { + ipv4_enabled = true + ipv6_enabled = true + } + + lifecycle { + ignore_changes = [ssh_keys] + } +} + +resource "hcloud_server_network" "worker" { + count = 1 + server_id = hcloud_server.worker[count.index].id + network_id = hcloud_network.cluster.id + ip = "10.0.1.${20 + count.index}" +} + +############################################################ +# TALOS CONFIGURATION +############################################################ + +# Generate Talos machine secrets +resource "talos_machine_secrets" "this" { + talos_version = var.talos_version +} + +# Generate Talos client configuration +data "talos_client_configuration" "this" { + cluster_name = local.cluster_name + client_configuration = talos_machine_secrets.this.client_configuration + endpoints = [hcloud_floating_ip.control_plane.ip_address] +} + +# Control plane configuration +data "talos_machine_configuration" "control_plane" { + count = 3 + cluster_name = local.cluster_name + machine_type = "controlplane" + cluster_endpoint = "https://${hcloud_floating_ip.control_plane.ip_address}:6443" + machine_secrets = talos_machine_secrets.this.machine_secrets 
+ talos_version = var.talos_version + + config_patches = [ + templatefile("${path.module}/talos-patches/control-plane.yaml", { + cluster_name = local.cluster_name + node_name = hcloud_server.control_plane[count.index].name + is_ha = true + is_first_cp = count.index == 0 + etcd_peers = [for i in range(3) : "10.0.1.${10 + i}"] + floating_ip = hcloud_floating_ip.control_plane.ip_address + }) + ] + + depends_on = [ + hcloud_server.control_plane, + hcloud_floating_ip_assignment.control_plane + ] +} + +# Worker configuration +data "talos_machine_configuration" "worker" { + count = 1 + cluster_name = local.cluster_name + machine_type = "worker" + cluster_endpoint = "https://${hcloud_floating_ip.control_plane.ip_address}:6443" + machine_secrets = talos_machine_secrets.this.machine_secrets + talos_version = var.talos_version + + config_patches = [ + templatefile("${path.module}/talos-patches/worker.yaml", { + cluster_name = local.cluster_name + node_name = hcloud_server.worker[count.index].name + }) + ] + + depends_on = [ + hcloud_server.worker, + hcloud_floating_ip_assignment.control_plane + ] +} + +############################################################ +# APPLY TALOS CONFIGURATION +############################################################ + +resource "talos_machine_configuration_apply" "control_plane" { + count = 3 + client_configuration = talos_machine_secrets.this.client_configuration + machine_configuration_input = data.talos_machine_configuration.control_plane[count.index].machine_configuration + endpoint = hcloud_server.control_plane[count.index].ipv4_address + node = hcloud_server.control_plane[count.index].ipv4_address + + depends_on = [ + hcloud_server_network.control_plane, + data.talos_machine_configuration.control_plane + ] +} + +resource "talos_machine_configuration_apply" "worker" { + count = 1 + client_configuration = talos_machine_secrets.this.client_configuration + machine_configuration_input = 
data.talos_machine_configuration.worker[count.index].machine_configuration + endpoint = hcloud_server.worker[count.index].ipv4_address + node = hcloud_server.worker[count.index].ipv4_address + + depends_on = [ + hcloud_server_network.worker, + data.talos_machine_configuration.worker, + talos_machine_configuration_apply.control_plane + ] +} + +############################################################ +# BOOTSTRAP KUBERNETES +############################################################ + +resource "talos_machine_bootstrap" "this" { + client_configuration = talos_machine_secrets.this.client_configuration + node = hcloud_server.control_plane[0].ipv4_address + + depends_on = [ + talos_machine_configuration_apply.control_plane, + talos_machine_configuration_apply.worker + ] +} + +############################################################ +# GET KUBECONFIG +############################################################ + +resource "talos_cluster_kubeconfig" "this" { + client_configuration = talos_machine_secrets.this.client_configuration + node = hcloud_server.control_plane[0].ipv4_address + + depends_on = [talos_machine_bootstrap.this] +} + +############################################################ +# SAVE CONFIGURATIONS +############################################################ + +resource "local_sensitive_file" "kubeconfig" { + # Replace the internal hostname with the floating IP for external access + content = replace( + talos_cluster_kubeconfig.this.kubeconfig_raw, + "https://${local.cluster_name}.local:6443", + "https://${hcloud_floating_ip.control_plane.ip_address}:6443" + ) + filename = "${path.root}/kubeconfig" +} + +resource "local_sensitive_file" "talosconfig" { + content = data.talos_client_configuration.this.talos_config + filename = "${path.root}/talosconfig" +} \ No newline at end of file diff --git a/aula-08/outputs.tf b/aula-08/outputs.tf new file mode 100644 index 0000000..736647d --- /dev/null +++ b/aula-08/outputs.tf @@ -0,0 +1,153 @@ 
+############################################################ +# Outputs for Hetzner Talos Kubernetes Cluster +############################################################ + +# Cluster Information +output "cluster_name" { + description = "The name of the Kubernetes cluster" + value = local.cluster_name +} + +output "cluster_id" { + description = "The unique identifier for the cluster" + value = random_string.cluster_id.result +} + +# Network Information +output "network_id" { + description = "The ID of the cluster's private network" + value = hcloud_network.cluster.id +} + +output "network_cidr" { + description = "The CIDR range of the cluster network" + value = hcloud_network_subnet.cluster.ip_range +} + +# Control Plane Information +output "control_plane_ip" { + description = "Public IP address of the control plane" + value = hcloud_floating_ip.control_plane.ip_address +} + +output "control_plane_private_ips" { + description = "Private IP addresses of control plane nodes" + value = [for cp in hcloud_server_network.control_plane : cp.ip] +} + +output "control_plane_ids" { + description = "Server IDs of control plane nodes" + value = [for cp in hcloud_server.control_plane : cp.id] +} + +# Worker Nodes Information +output "worker_ips" { + description = "Public IP addresses of worker nodes" + value = [for w in hcloud_server.worker : w.ipv4_address] +} + +output "worker_private_ips" { + description = "Private IP addresses of worker nodes" + value = [for w in hcloud_server_network.worker : w.ip] +} + +output "worker_ids" { + description = "Server IDs of worker nodes" + value = [for w in hcloud_server.worker : w.id] +} + +# Kubernetes Access +output "kubeconfig_path" { + description = "Path to the generated kubeconfig file" + value = local_sensitive_file.kubeconfig.filename +} + +output "talosconfig_path" { + description = "Path to the generated talosconfig file" + value = local_sensitive_file.talosconfig.filename +} + +# API Endpoints +output "kubernetes_api_endpoint" 
{ + description = "Kubernetes API server endpoint" + value = "https://${hcloud_floating_ip.control_plane.ip_address}:6443" +} + +output "talos_api_endpoint" { + description = "Talos API endpoint for management" + value = "https://${hcloud_floating_ip.control_plane.ip_address}:50000" +} + +# Cost Information +output "estimated_monthly_cost" { + description = "Estimated monthly cost for the infrastructure (EUR)" + value = { + control_plane = 3 * 3.79 # 3x CAX11 + worker = 1 * 3.79 # 1x CAX11 + floating_ip = 3.00 # Floating IPv4 + total = (4 * 3.79) + 3.00 # ~€18.16 + } +} + +# Connection Instructions +output "connection_instructions" { + description = "Instructions for connecting to the cluster" + value = <<-EOT + + ==================================== + Kubernetes Cluster Ready! + ==================================== + + 1. Configure kubectl: + export KUBECONFIG=${local_sensitive_file.kubeconfig.filename} + kubectl get nodes + + 2. Configure talosctl: + export TALOSCONFIG=${local_sensitive_file.talosconfig.filename} + talosctl --nodes ${hcloud_floating_ip.control_plane.ip_address} health + + 3. Access Kubernetes API: + ${"https://${hcloud_floating_ip.control_plane.ip_address}:6443"} + + 4. Nodes: + Control Plane: 3x CAX11 (ARM64) + Workers: 1x CAX11 (ARM64) + + 5. 
Total Monthly Cost: ~€18/month + + ==================================== + EOT +} + +# Cluster Autoscaler Configuration +output "autoscaler_worker_config" { + description = "Worker machine config for cluster autoscaler (base64)" + value = base64encode(data.talos_machine_configuration.worker[0].machine_configuration) + sensitive = true +} + +output "autoscaler_image_id" { + description = "Talos image ID for cluster autoscaler" + value = var.talos_image_id +} + +# Resource Labels +output "resource_labels" { + description = "Labels applied to all resources" + value = local.common_labels +} + +# Firewall Information +output "firewall_id" { + description = "ID of the firewall protecting the cluster" + value = hcloud_firewall.cluster.id +} + +# SSH Key Information (for autoscaler) +output "ssh_key_name" { + description = "Name of the SSH key used by the cluster" + value = length(local.ssh_key_matches) > 0 ? [ + for key in data.hcloud_ssh_keys.all.ssh_keys : key.name + if key.id == local.ssh_key_matches[0] + ][0] : "${local.cluster_name}-admin" +} diff --git a/aula-08/setup.sh b/aula-08/setup.sh new file mode 100755 index 0000000..3c46f1d --- /dev/null +++ b/aula-08/setup.sh @@ -0,0 +1,361 @@ +#!/bin/bash + +############################################################ +# Aula 08 - OpenTofu + Talos + Hetzner Cloud +# Provisiona cluster Kubernetes Talos em HA +############################################################ + +set -e + +# Cores para output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Diretório do script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# Funções de log +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[OK]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +############################################################ +# VERIFICAÇÃO DE 
PRÉ-REQUISITOS
+############################################################
+
+echo ""
+echo "============================================"
+echo " Aula 08 - Cluster Talos via OpenTofu"
+echo "============================================"
+echo ""
+
+log_info "Verificando pré-requisitos..."
+
+# Verificar OpenTofu
+if ! command -v tofu &> /dev/null; then
+ log_error "OpenTofu não encontrado!"
+ echo ""
+ echo "Instale o OpenTofu:"
+ echo " brew install opentofu # macOS"
+ echo " snap install opentofu # Linux"
+ echo ""
+ echo "Mais info: https://opentofu.org/docs/intro/install/"
+ exit 1
+fi
+log_success "OpenTofu $(tofu version | head -1)"
+
+# Verificar talosctl
+if ! command -v talosctl &> /dev/null; then
+ log_error "talosctl não encontrado!"
+ echo ""
+ echo "Instale o talosctl:"
+ echo " brew install siderolabs/tap/talosctl # macOS"
+ echo " curl -sL https://talos.dev/install | sh # Linux"
+ echo ""
+ exit 1
+fi
+log_success "talosctl $(talosctl version --client 2>/dev/null | grep 'Tag:' | awk '{print $2}' || echo 'instalado')"
+
+# Verificar kubectl
+if ! command -v kubectl &> /dev/null; then
+ log_error "kubectl não encontrado!"
+ echo ""
+ echo "Instale o kubectl:"
+ echo " brew install kubectl # macOS"
+ echo " snap install kubectl # Linux"
+ echo ""
+ exit 1
+fi
+log_success "kubectl $(kubectl version --client -o yaml 2>/dev/null | grep gitVersion | awk '{print $2}' || echo 'instalado')"
+
+# Verificar hcloud CLI (opcional, mas útil)
+if command -v hcloud &> /dev/null; then
+ log_success "hcloud CLI instalado"
+else
+ log_warn "hcloud CLI não instalado (opcional)"
+ echo " Para listar imagens: brew install hcloud"
+fi
+
+echo ""
+
+############################################################
+# COLETA DE CREDENCIAIS
+############################################################
+
+# Verificar se terraform.tfvars já existe
+if [ -f "terraform.tfvars" ]; then
+ log_warn "terraform.tfvars já existe!"
+ read -p "Deseja sobrescrever? 
(s/N): " overwrite + if [[ ! "$overwrite" =~ ^[Ss]$ ]]; then + log_info "Usando terraform.tfvars existente" + SKIP_CREDENTIALS=true + fi +fi + +if [ "$SKIP_CREDENTIALS" != "true" ]; then + echo "============================================" + echo " Configuração de Credenciais" + echo "============================================" + echo "" + + # Token Hetzner + echo "1. Token da API Hetzner Cloud" + echo " Obtenha em: https://console.hetzner.cloud/projects/*/security/tokens" + echo "" + read -sp " Digite o token: " HCLOUD_TOKEN + echo "" + + if [ -z "$HCLOUD_TOKEN" ]; then + log_error "Token não pode ser vazio!" + exit 1 + fi + log_success "Token configurado" + echo "" + + # SSH Key + echo "2. Chave SSH pública" + DEFAULT_SSH_KEY="$HOME/.ssh/id_rsa.pub" + if [ -f "$DEFAULT_SSH_KEY" ]; then + echo " Encontrada: $DEFAULT_SSH_KEY" + read -p " Usar esta chave? (S/n): " use_default + if [[ ! "$use_default" =~ ^[Nn]$ ]]; then + SSH_PUBLIC_KEY=$(cat "$DEFAULT_SSH_KEY") + fi + fi + + if [ -z "$SSH_PUBLIC_KEY" ]; then + read -p " Caminho da chave pública: " ssh_path + if [ -f "$ssh_path" ]; then + SSH_PUBLIC_KEY=$(cat "$ssh_path") + else + log_error "Arquivo não encontrado: $ssh_path" + exit 1 + fi + fi + log_success "Chave SSH configurada" + echo "" + + # ID da imagem Talos + echo "3. ID da imagem Talos (snapshot da aula-07)" + echo " Para listar: hcloud image list --type snapshot" + echo "" + read -p " Digite o ID da imagem: " TALOS_IMAGE_ID + + if [ -z "$TALOS_IMAGE_ID" ]; then + log_error "ID da imagem não pode ser vazio!" + exit 1 + fi + + # Validar que é número + if ! [[ "$TALOS_IMAGE_ID" =~ ^[0-9]+$ ]]; then + log_error "ID deve ser um número!" + exit 1 + fi + log_success "Image ID: $TALOS_IMAGE_ID" + echo "" + + # Criar terraform.tfvars + log_info "Criando terraform.tfvars..." 
+ cat > terraform.tfvars << EOF +# Gerado automaticamente por setup.sh +# $(date) + +hcloud_token = "$HCLOUD_TOKEN" +ssh_public_key = "$SSH_PUBLIC_KEY" +talos_image_id = $TALOS_IMAGE_ID + +environment = "workshop" +enable_monitoring = true +EOF + log_success "terraform.tfvars criado" +fi + +echo "" + +############################################################ +# INICIALIZAÇÃO DO OPENTOFU +############################################################ + +echo "============================================" +echo " Inicializando OpenTofu" +echo "============================================" +echo "" + +log_info "Executando tofu init..." +tofu init + +log_success "OpenTofu inicializado" +echo "" + +############################################################ +# PLANEJAMENTO +############################################################ + +echo "============================================" +echo " Planejando Infraestrutura" +echo "============================================" +echo "" + +log_info "Executando tofu plan..." +tofu plan -out=tfplan + +echo "" +log_success "Plano criado!" +echo "" + +# Mostrar resumo +echo "============================================" +echo " Recursos a serem criados:" +echo "============================================" +echo "" +echo " - 4x CAX11 (3 CP + 1 Worker) = 4 x €3.79 = €15.16" +echo " - 1x Floating IPv4 = €3.00" +echo " - Rede/Firewall/Placement = Grátis" +echo "" +echo " Custo estimado: ~€18.16/mês (sem VAT)" +echo "" + +############################################################ +# APLICAÇÃO +############################################################ + +read -p "Deseja aplicar o plano? (s/N): " apply +if [[ ! "$apply" =~ ^[Ss]$ ]]; then + log_warn "Operação cancelada pelo usuário" + echo "" + echo "Para aplicar manualmente:" + echo " tofu apply tfplan" + echo "" + exit 0 +fi + +echo "" +log_info "Aplicando infraestrutura..." +echo "" + +tofu apply tfplan + +echo "" +log_success "Infraestrutura provisionada!" 
+echo "" + +############################################################ +# CONFIGURAÇÃO PÓS-DEPLOY +############################################################ + +echo "============================================" +echo " Configuração Pós-Deploy" +echo "============================================" +echo "" + +# Aguardar cluster ficar pronto +log_info "Aguardando cluster Talos ficar pronto..." +sleep 10 + +# Configurar talosctl +if [ -f "talosconfig" ]; then + log_info "Configurando talosctl..." + export TALOSCONFIG="$SCRIPT_DIR/talosconfig" + + # Obter IP do control plane + CP_IP=$(tofu output -raw control_plane_ip 2>/dev/null || echo "") + + if [ -n "$CP_IP" ]; then + log_info "Aguardando API do Talos em $CP_IP..." + + # Tentar health check (pode demorar alguns minutos) + for i in {1..30}; do + if talosctl --talosconfig talosconfig -n "$CP_IP" health --wait-timeout 10s 2>/dev/null; then + log_success "Cluster Talos saudável!" + break + fi + echo -n "." + sleep 10 + done + echo "" + fi +fi + +# Configurar kubectl +if [ -f "kubeconfig" ]; then + log_info "Configurando kubectl..." + export KUBECONFIG="$SCRIPT_DIR/kubeconfig" + + log_info "Aguardando nodes ficarem Ready..." + for i in {1..30}; do + if kubectl get nodes 2>/dev/null | grep -q "Ready"; then + log_success "Nodes prontos!" + kubectl get nodes + break + fi + echo -n "." + sleep 10 + done + echo "" +fi + +echo "" + +############################################################ +# RESUMO FINAL +############################################################ + +echo "============================================" +echo " Cluster Provisionado com Sucesso!" 
+echo "============================================"
+echo ""
+
+# Mostrar outputs
+echo "Endpoints:"
+tofu output -raw kubernetes_api_endpoint 2>/dev/null && echo "" || true
+tofu output -raw talos_api_endpoint 2>/dev/null && echo "" || true
+echo ""
+
+echo "Arquivos gerados:"
+echo " - kubeconfig : Configuração do kubectl"
+echo " - talosconfig : Configuração do talosctl"
+echo ""
+
+echo "Comandos úteis:"
+echo ""
+echo " # Usar kubectl com este cluster"
+echo " export KUBECONFIG=$SCRIPT_DIR/kubeconfig"
+echo " kubectl get nodes"
+echo ""
+echo " # Usar talosctl com este cluster"
+echo " export TALOSCONFIG=$SCRIPT_DIR/talosconfig"
+echo " talosctl -n $CP_IP health"
+echo ""
+echo " # Ver outputs do OpenTofu"
+echo " tofu output"
+echo ""
+echo " # Destruir infraestrutura (CUIDADO!)"
+echo " ./cleanup.sh"
+echo ""
+
+log_success "Setup concluído!"
+
+echo ""
+echo "============================================"
+echo " Próximo passo (opcional)"
+echo "============================================"
+echo ""
+echo " Para habilitar autoscaling de 1-5 workers:"
+echo " ./install-autoscaler.sh"
+echo ""
diff --git a/aula-08/talos-patches/control-plane.yaml b/aula-08/talos-patches/control-plane.yaml
new file mode 100644
index 0000000..72730b3
--- /dev/null
+++ b/aula-08/talos-patches/control-plane.yaml
@@ -0,0 +1,63 @@
+# Talos Control Plane Configuration Patch
+# Base configuration for HA control plane
+machine:
+ # Network configuration for Floating IP
+ network:
+ interfaces:
+ - interface: eth0
+ dhcp: true
+%{ if is_first_cp ~}
+ addresses:
+ - ${floating_ip}/32
+%{ endif ~}
+
+ # Network optimizations
+ sysctls:
+ net.core.somaxconn: "8192"
+ net.ipv4.tcp_max_syn_backlog: "8192"
+ net.core.netdev_max_backlog: "5000"
+ net.ipv4.ip_local_port_range: "1024 65535"
+ net.ipv4.tcp_tw_reuse: "1"
+ net.ipv4.tcp_fin_timeout: "15"
+ fs.file-max: "2097152"
+ fs.inotify.max_user_watches: "524288"
+ vm.max_map_count: "262144"
+
+ # Kubelet configuration
+ kubelet:
+ extraArgs:
+ 
max-pods: "110" + kube-reserved: "cpu=200m,memory=300Mi" + system-reserved: "cpu=200m,memory=200Mi" + + # Time sync + time: + servers: + - ntp1.hetzner.de + - ntp2.hetzner.com + - ntp3.hetzner.net + + # Features + features: + rbac: true + stableHostname: true + +cluster: + # Control plane configuration + controlPlane: + endpoint: https://${floating_ip}:6443 + + # Network configuration + network: + cni: + name: flannel + dnsDomain: cluster.local + serviceSubnets: + - 10.96.0.0/12 + podSubnets: + - 10.244.0.0/16 + + # Etcd configuration for HA + etcd: + advertisedSubnets: + - 10.0.1.0/24 \ No newline at end of file diff --git a/aula-08/talos-patches/worker.yaml b/aula-08/talos-patches/worker.yaml new file mode 100644 index 0000000..b5c0d24 --- /dev/null +++ b/aula-08/talos-patches/worker.yaml @@ -0,0 +1,44 @@ +# Talos Worker Configuration Patch +# Base configuration for worker nodes +machine: + # Network optimizations + sysctls: + net.core.somaxconn: "8192" + net.ipv4.tcp_max_syn_backlog: "8192" + net.core.netdev_max_backlog: "5000" + net.ipv4.ip_local_port_range: "1024 65535" + net.ipv4.tcp_tw_reuse: "1" + net.ipv4.tcp_fin_timeout: "15" + fs.file-max: "2097152" + fs.inotify.max_user_watches: "524288" + vm.max_map_count: "262144" + + # Kubelet configuration + kubelet: + extraArgs: + max-pods: "110" + kube-reserved: "cpu=100m,memory=200Mi" + system-reserved: "cpu=100m,memory=100Mi" + + # Time sync + time: + servers: + - ntp1.hetzner.de + - ntp2.hetzner.com + - ntp3.hetzner.net + + # Features + features: + rbac: true + stableHostname: true + +cluster: + # Network configuration + network: + cni: + name: flannel + dnsDomain: cluster.local + serviceSubnets: + - 10.96.0.0/12 + podSubnets: + - 10.244.0.0/16 \ No newline at end of file diff --git a/aula-08/terraform.tfvars.example b/aula-08/terraform.tfvars.example new file mode 100644 index 0000000..5a18add --- /dev/null +++ b/aula-08/terraform.tfvars.example @@ -0,0 +1,53 @@ +# Exemplo de arquivo terraform.tfvars +# Copie 
este arquivo para terraform.tfvars e preencha com seus valores + +# ============================================ +# CREDENCIAIS (OBRIGATÓRIO) +# ============================================ + +# Token da API Hetzner Cloud +# Obtenha em: https://console.hetzner.cloud/projects/[PROJECT_ID]/security/tokens +hcloud_token = "seu_token_hetzner_aqui" + +# Chave SSH pública para acesso emergencial aos nodes +# Obtenha com: cat ~/.ssh/id_rsa.pub +ssh_public_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC... seu@email.com" + +# ID da imagem Talos customizada (criada na aula-07) +# Obtenha com: hcloud image list --type snapshot +talos_image_id = 123456789 + +# ============================================ +# CONFIGURAÇÃO DO CLUSTER +# ============================================ + +# Ambiente (prod, staging, dev) +environment = "workshop" + +# Versão do Talos OS (opcional - default: v1.11.2) +# talos_version = "v1.11.2" + +# ============================================ +# MONITORAMENTO +# ============================================ + +# Habilitar Victoria Metrics +enable_monitoring = true + +# ============================================ +# AUTO-SCALING +# ============================================ + +# Thresholds de CPU para scaling +scale_up_threshold = 70 # Escala quando CPU > 70% +scale_down_threshold = 30 # Reduz quando CPU < 30% + +# ============================================ +# LABELS CUSTOMIZADAS (OPCIONAL) +# ============================================ + +# Labels adicionais para todos os recursos +custom_labels = { + projeto = "k8s-base" + responsavel = "devops" +} \ No newline at end of file diff --git a/aula-08/test-autoscaler.yaml b/aula-08/test-autoscaler.yaml new file mode 100644 index 0000000..d712a6a --- /dev/null +++ b/aula-08/test-autoscaler.yaml @@ -0,0 +1,38 @@ +############################################################ +# Deployment de teste para o Cluster Autoscaler +# Cria pods que consomem recursos para forçar scale-up 
+############################################################ + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: test-autoscaler + namespace: default +spec: + replicas: 10 + selector: + matchLabels: + app: test-autoscaler + template: + metadata: + labels: + app: test-autoscaler + spec: + containers: + - name: nginx + image: nginx:alpine + resources: + requests: + cpu: 400m # Cada pod pede 0.4 CPU + memory: 512Mi # Cada pod pede 512MB RAM + limits: + cpu: 500m + memory: 640Mi + # Evita que pods rodem nos control-planes + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist diff --git a/aula-08/variables.tf b/aula-08/variables.tf new file mode 100644 index 0000000..726f6b2 --- /dev/null +++ b/aula-08/variables.tf @@ -0,0 +1,63 @@ +############################################################ +# Variables for Hetzner Talos Kubernetes Cluster +############################################################ + +# Authentication +variable "hcloud_token" { + type = string + description = "Hetzner Cloud API token" + sensitive = true +} + +# Cluster Configuration +variable "environment" { + type = string + description = "Environment name (prod, staging, dev)" + default = "prod" +} + + +# SSH Configuration +variable "ssh_public_key" { + type = string + description = "Public SSH key for emergency access to nodes" +} + +# Talos Configuration +variable "talos_image_id" { + type = number + description = "ID da imagem Talos customizada na Hetzner (criada na aula-07). 
Obtenha com: hcloud image list --type snapshot" +} + +variable "talos_version" { + type = string + description = "Talos version to use" + default = "v1.11.2" # Match the official image version +} + +# Monitoring Configuration +variable "enable_monitoring" { + type = bool + description = "Enable Victoria Metrics monitoring stack" + default = true +} + +# Auto-scaling Configuration +variable "scale_up_threshold" { + type = number + description = "CPU percentage to trigger scale up" + default = 70 +} + +variable "scale_down_threshold" { + type = number + description = "CPU percentage to trigger scale down" + default = 30 +} + +# Tags for resource management +variable "custom_labels" { + type = map(string) + description = "Custom labels to add to all resources" + default = {} +} diff --git a/aula-08/versions.tf b/aula-08/versions.tf new file mode 100644 index 0000000..5ecbd3b --- /dev/null +++ b/aula-08/versions.tf @@ -0,0 +1,35 @@ +############################################################ +# OpenTofu Version and Provider Requirements +# Compatible with OpenTofu >= 1.6.0 +############################################################ + +terraform { + required_version = ">= 1.6.0" + + required_providers { + hcloud = { + source = "hetznercloud/hcloud" + version = "~> 1.45" + } + + talos = { + source = "siderolabs/talos" + version = "0.6.0" + } + + random = { + source = "hashicorp/random" + version = "~> 3.5" + } + + null = { + source = "hashicorp/null" + version = "~> 3.2" + } + + local = { + source = "hashicorp/local" + version = "~> 2.4" + } + } +} \ No newline at end of file